/*
 * MPID_Get_accumulate - thin wrapper around MPIDI_CH3I_Get_accumulate.
 *
 * Every argument is forwarded unchanged and NULL is supplied for the trailing
 * request argument.  The return value is whatever the inner routine returns.
 */
int MPID_Get_accumulate(const void *origin_addr, int origin_count,
                        MPI_Datatype origin_datatype, void *result_addr, int result_count,
                        MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                        int target_count, MPI_Datatype target_datatype, MPI_Op op,
                        MPIR_Win * win_ptr)
{
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_GET_ACCUMULATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_GET_ACCUMULATE);

    /* No request is associated with this entry point, hence the NULL. */
    mpi_errno = MPIDI_CH3I_Get_accumulate(origin_addr, origin_count, origin_datatype,
                                          result_addr, result_count, result_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, op, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_GET_ACCUMULATE);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_create - set up a window over caller-provided memory.
 *
 * Step 1: exchange the basic window information through the gather_info hook.
 * Step 2: if the window's info arguments request shared memory (alloc_shm)
 *         and a detect_shm hook is installed, call it so that shm RMA can be
 *         enabled when the buffers of the node-local processes are shared.
 *
 * Returns MPI_SUCCESS or the error code produced by one of the hooks.
 */
int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                          MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Shared-memory detection is optional: skip it unless it was requested
     * and the hook is actually available. */
    if ((*win_ptr)->info_args.alloc_shm != TRUE || MPIDI_CH3U_Win_fns.detect_shm == NULL) {
        goto fn_exit;
    }

    /* Detect whether shared buffers were specified for the processes on the
     * current node.  If so, shm RMA gets enabled. */
    mpi_errno = MPIDI_CH3U_Win_fns.detect_shm(win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3I_Win_init - reset the shared-memory bookkeeping of a window.
 *
 * Clears the base address, length and handle of the data segment, of the
 * mutex segment and of the info segment stored in *win_ptr.  The remaining
 * parameters are not used by this routine.  Always returns MPI_SUCCESS.
 */
static int MPIDI_CH3I_Win_init(MPI_Aint size, int disp_unit, int create_flavor, int model,
                               MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *win;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_INIT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_INIT);

    win = *win_ptr;

    /* data segment */
    win->shm_base_addr = NULL;
    win->shm_segment_len = 0;
    win->shm_segment_handle = 0;

    /* interprocess mutex segment */
    win->shm_mutex = NULL;
    win->shm_mutex_segment_handle = 0;

    /* basic-info segment */
    win->info_shm_base_addr = NULL;
    win->info_shm_segment_len = 0;
    win->info_shm_segment_handle = 0;

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_INIT);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_allocate - allocate window memory, via shared memory when
 * possible.
 *
 * The allocate_shm hook is used when the window's info arguments request
 * shared memory (alloc_shm == TRUE) and the hook is installed; in every other
 * case the allocation is done by MPIDI_CH3U_Win_allocate_no_shm.
 *
 * Returns MPI_SUCCESS or the error code of the allocator that was called.
 */
int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPIR_Info * info,
                            MPIR_Comm * comm_ptr, void *baseptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);

    if ((*win_ptr)->info_args.alloc_shm == TRUE && MPIDI_CH3U_Win_fns.allocate_shm != NULL) {
        /* shared-memory capable allocation */
        mpi_errno = MPIDI_CH3U_Win_fns.allocate_shm(size, disp_unit, info, comm_ptr,
                                                    baseptr, win_ptr);
    } else {
        /* plain allocation */
        mpi_errno = MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr,
                                                   baseptr, win_ptr);
    }
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3I_Win_detect_shm - check whether the buffers of a window live
 * inside an already cached shared-memory window and, if so, enable shm on it.
 *
 * Does nothing when the window's communicator has no node_comm or when
 * MPIDI_CH3I_SHM_Wins_match reports no matching window.  On a match the
 * window is marked shm_allocated, its shm_base_addrs table is allocated and
 * filled with "matched segment base + per-process offset", and the mutex of
 * the matched window is reused.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
static int MPIDI_CH3I_Win_detect_shm(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *win = *win_ptr;
    MPIR_Win *matched = NULL;
    MPI_Aint *offs;
    MPI_Aint matched_base;
    int local_procs, p;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);

    if (win->comm_ptr->node_comm == NULL) {
        goto fn_exit;
    }

    local_procs = win->comm_ptr->node_comm->local_size;

    /* temporary array receiving one offset per node-local process */
    MPIR_CHKLMEM_MALLOC(offs, MPI_Aint *, local_procs * sizeof(MPI_Aint),
                        mpi_errno, "base_shm_offs");

    /* Ask for the first matching shared window.  The shared windows that
     * include all local processes are stored in the same order on every
     * local process, so the first match is the same everywhere. */
    mpi_errno = MPIDI_CH3I_SHM_Wins_match(win_ptr, &matched, &offs);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    if (matched == NULL) {
        goto fn_exit;
    }

    win->shm_allocated = TRUE;
    MPIR_CHKPMEM_MALLOC(win->shm_base_addrs, void **, local_procs * sizeof(void *),
                        mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* shm_base_addrs[p] = base of the matched segment + offset of process p */
    matched_base = (MPI_Aint) matched->shm_base_addr;
    for (p = 0; p < local_procs; p++) {
        win->shm_base_addrs[p] = (void *) (matched_base + offs[p]);
    }

    /* TODO: should we use the same mutex or create a new one ?
     * It causes unnecessary synchronization. */
    win->shm_mutex = matched->shm_mutex;

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3_Win_hooks_init - install the channel's window hooks.
 *
 * When MPIDI_CH3I_Shm_supported() reports support, the win_init and win_free
 * entries of *win_hooks are pointed at the shared-memory implementations;
 * otherwise the table is left untouched.  Always returns MPI_SUCCESS.
 */
int MPIDI_CH3_Win_hooks_init(MPIDI_CH3U_Win_hooks_t * win_hooks)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);

    if (!MPIDI_CH3I_Shm_supported()) {
        goto fn_exit;
    }

    win_hooks->win_init = MPIDI_CH3I_Win_init;
    win_hooks->win_free = MPIDI_CH3_SHM_Win_free;

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3_WIN_HOOKS_INIT);
    return mpi_errno;
}
/*
 * MPID_Win_detach - detach memory from a window.
 *
 * Intentionally a no-op: all of memory is exposed, so there is nothing to
 * undo.  Neither argument is used.  Always returns MPI_SUCCESS.
 */
int MPID_Win_detach(MPIR_Win * win, const void *base)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_WIN_DETACH);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_WIN_DETACH);

    /* nothing to do */

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_WIN_DETACH);
    return mpi_errno;
}
/*
 * MPIDI_CH3_Win_fns_init - override window functions with the channel's
 * shared-memory variants.
 *
 * When MPIDI_CH3I_Shm_supported() reports support, the allocate_shm,
 * detect_shm, gather_info and shared_query entries of *win_fns are replaced;
 * otherwise the table is left untouched.  Always returns MPI_SUCCESS.
 */
int MPIDI_CH3_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);

    if (!MPIDI_CH3I_Shm_supported()) {
        goto fn_exit;
    }

    win_fns->gather_info = MPIDI_CH3I_Win_gather_info;
    win_fns->shared_query = MPIDI_CH3_SHM_Win_shared_query;
    win_fns->allocate_shm = MPIDI_CH3I_Win_allocate_shm;
    win_fns->detect_shm = MPIDI_CH3I_Win_detect_shm;

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3_WIN_FNS_INIT);
    return mpi_errno;
}
/*
 * MPIDI_Win_fns_init - fill *win_fns with the CH3U window functions.
 *
 * Note that both the allocate and the allocate_shared entries are set to
 * MPIDI_CH3U_Win_allocate.  Always returns MPI_SUCCESS.
 */
int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t * win_fns)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_WIN_FNS_INIT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_WIN_FNS_INIT);

    /* window creation flavors */
    win_fns->create = MPIDI_CH3U_Win_create;
    win_fns->create_dynamic = MPIDI_CH3U_Win_create_dynamic;
    win_fns->allocate = MPIDI_CH3U_Win_allocate;
    win_fns->allocate_shared = MPIDI_CH3U_Win_allocate;

    /* helpers */
    win_fns->gather_info = MPIDI_CH3U_Win_gather_info;
    win_fns->shared_query = MPIDI_CH3U_Win_shared_query;

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_WIN_FNS_INIT);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_create_dynamic - set up a dynamic window.
 *
 * Only the basic window information is exchanged, through the gather_info
 * hook, using MPI_BOTTOM as base, a size of 0 and a displacement unit of 1.
 *
 * Returns MPI_SUCCESS or the error code of the hook.
 */
int MPIDI_CH3U_Win_create_dynamic(MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);

    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPID_Put - thin wrapper around MPIDI_CH3I_Put.
 *
 * Every argument is forwarded unchanged and NULL is supplied for the trailing
 * request argument.  The return value is whatever the inner routine returns.
 */
int MPID_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPIR_Win * win_ptr)
{
    int mpi_errno;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_PUT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_PUT);

    /* No request is associated with this entry point, hence the NULL. */
    mpi_errno = MPIDI_CH3I_Put(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count,
                               target_datatype, win_ptr, NULL);

    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_PUT);
    return mpi_errno;
}
/*
 * MPIDI_CH3I_Win_allocate_shm - allocate the window memory of all node-local
 * processes inside one shared-memory segment.
 *
 * Parameters:
 *   size      - number of bytes this process contributes to the window
 *   disp_unit - displacement unit, forwarded to the info exchange
 *   info      - info object, forwarded unchanged
 *   comm_ptr  - communicator, forwarded unchanged
 *   base_ptr  - really a (void **); receives this process's window base
 *   win_ptr   - window being set up
 *
 * Outline:
 *   1. Without a node_comm, fall back to MPIDI_CH3U_Win_allocate_no_shm.
 *   2. Allgather the per-process sizes over node_comm and sum them (each one
 *      rounded up to the page size when alloc_shared_noncontig is set).
 *   3. If the sum is non-zero, node rank 0 creates the data segment and the
 *      mutex segment and broadcasts their serialized handles; the other
 *      ranks attach.  Rank 0 removes each segment after a barrier.
 *   4. Fill shm_base_addrs, set the window base, exchange the basic window
 *      information and cache the window in shm_wins_list.
 *
 * Returns MPI_SUCCESS or an MPI error code.  On failure the shm_base_addrs
 * allocation is released through MPIR_CHKPMEM_REAP.
 */
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    /* Without a node communicator there is nobody to share with: use the
     * regular allocation path instead. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* The shared memory segment is set up among the processes that are on
     * the same node as this process (node_comm).  The node_comm == NULL case
     * was already handled above. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* allocate memory for the base addresses of all node-local processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows of all processes.  allocate temp.
     * buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    (*win_ptr)->shm_segment_len = 0;

    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    if ((*win_ptr)->shm_segment_len == 0) {
        /* Nobody on this node asked for memory, so no segment is created.
         * The address table was allocated above and shm_allocated is TRUE,
         * so give every entry a defined value instead of leaving the freshly
         * allocated array uninitialized. */
        for (i = 0; i < node_size; i++) {
            ((*win_ptr)->shm_base_addrs)[i] = NULL;
        }
        (*win_ptr)->base = NULL;
    }
    else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                              (*win_ptr)->shm_segment_len,
                                              (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                  &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                        strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
                                   (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocate the interprocess mutex segment. */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                              sizeof(MPIDI_CH3I_SHM_MUTEX),
                                              (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                  &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
                                        strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                   sizeof(MPIDI_CH3I_SHM_MUTEX),
                                   (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous
                     * process that has allocated non-zero bytes of shared
                     * memory.  We can not simply use "i-1" for the
                     * previous process because rank "i-1" might not have
                     * allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    }
                    else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                }
                else {
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    /* hand this process's base address back to the caller */
    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via shared memory region. */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3I_Win_gather_info - exchange the basic window information (base
 * address, size, displacement unit, window handle) of every process and keep
 * the resulting table in a segment shared by the node-local processes.
 *
 * Parameters:
 *   base, size, disp_unit - this process's contribution to the table
 *   info, comm_ptr        - only forwarded to MPIDI_CH3U_Win_gather_info on
 *                           the fallback path
 *   win_ptr               - window whose basic_info_table is set up
 *
 * Without a node_comm the work is delegated to MPIDI_CH3U_Win_gather_info.
 * Otherwise node rank 0 creates the info segment, broadcasts its serialized
 * handle and the other ranks attach.  After an Allgather over the window's
 * communicator only node rank 0 writes the table; a final barrier on
 * node_comm makes the result visible to the other local processes.
 *
 * Returns MPI_SUCCESS or an MPI error code.  A failure reported through
 * errflag by any of the collectives is turned into MPI_ERR_OTHER
 * ("**coll_fail").
 */
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                                      MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    MPIR_Comm *node_comm_ptr = NULL;
    int node_rank;
    int comm_rank, comm_size;
    MPI_Aint *tmp_buf = NULL;
    int i, k;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    /* No node communicator: use the regular (non-shared) table. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
        goto fn_exit;
    }

    comm_size = (*win_ptr)->comm_ptr->local_size;
    comm_rank = (*win_ptr)->comm_ptr->rank;

    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_rank = node_comm_ptr->rank;

    /* one table entry per process of the window's communicator */
    (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t);

    mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->info_shm_segment_handle);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        char *serialized_hnd_ptr = NULL;

        /* create shared memory region for all processes in win and map. */
        mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
                                                  (*win_ptr)->info_shm_segment_len,
                                                  (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* serialize handle and broadcast it to the other processes in win */
        mpi_errno =
            MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
                                              &serialized_hnd_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* wait for other processes to attach to win */
        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink shared memory region so it gets deleted when all processes exit */
        mpi_errno = MPL_shm_seg_remove((*win_ptr)->info_shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get serialized handle from rank 0 and deserialize it */
        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
                                            strlen(serialized_hnd));
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* attach to shared memory region created by rank 0 */
        mpi_errno =
            MPL_shm_seg_attach((*win_ptr)->info_shm_segment_handle,
                               (*win_ptr)->info_shm_segment_len,
                               (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    }

    /* the table lives directly in the shared segment */
    (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);

    /* four MPI_Aint slots per process: base, size, disp_unit, handle */
    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    tmp_buf[4 * comm_rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * comm_rank + 1] = size;
    tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
    /* Do not publish the gathered data when the collective reported a
     * failure through errflag; this matches the checks on the collectives
     * above. */
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    if (node_rank == 0) {
        /* only node_rank == 0 writes results to basic_info_table on shared memory region. */
        k = 0;
        for (i = 0; i < comm_size; i++) {
            (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
            (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
        }
    }

    /* Make sure that all local processes see the results written by node_rank == 0 */
    mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3I_SHM_Wins_match - search the cached shared-memory windows for one
 * whose segment contains the buffers of *win_ptr on every node-local process.
 *
 * Parameters:
 *   win_ptr           - window being created
 *   matched_win       - out: first matching cached window, or NULL
 *   base_shm_offs_ptr - in/out: caller-allocated array with one MPI_Aint per
 *                       node_comm process; filled with each local process's
 *                       offset relative to the candidate segment base
 *
 * A cached window is a candidate only if every process of the new node_comm
 * is also part of the cached window's communicator.  For a candidate the
 * offsets are allgathered over node_comm and the window matches when every
 * local buffer lies completely inside the cached segment.
 *
 * Returns MPI_SUCCESS or an MPI error code.  An error detected in the body is
 * not overwritten by the group cleanup at fn_exit.
 */
static int MPIDI_CH3I_SHM_Wins_match(MPIR_Win ** win_ptr, MPIR_Win ** matched_win,
                                     MPI_Aint ** base_shm_offs_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int i, comm_size;
    int node_size, node_rank, shm_node_size;
    int group_diff;
    int base_diff;

    MPIR_Comm *node_comm_ptr = NULL, *shm_node_comm_ptr = NULL;
    int *node_ranks = NULL, *node_ranks_in_shm_node = NULL;
    MPIR_Group *node_group_ptr = NULL, *shm_node_group_ptr = NULL;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPI_Aint *base_shm_offs;

    MPIDI_SHM_Win_t *elem = shm_wins_list;

    MPIR_CHKLMEM_DECL(2);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);

    *matched_win = NULL;
    base_shm_offs = *base_shm_offs_ptr;
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    comm_size = (*win_ptr)->comm_ptr->local_size;

    MPIR_CHKLMEM_MALLOC(node_ranks, int *, node_size * sizeof(int), mpi_errno, "node_ranks");
    MPIR_CHKLMEM_MALLOC(node_ranks_in_shm_node, int *, node_size * sizeof(int),
                        mpi_errno, "node_ranks_in_shm_comm");

    /* identity rank list 0..node_size-1, translated for each candidate */
    for (i = 0; i < node_size; i++) {
        node_ranks[i] = i;
    }

    mpi_errno = MPIR_Comm_group_impl(node_comm_ptr, &node_group_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    while (elem != NULL) {
        MPIR_Win *shm_win = elem->win;
        if (!shm_win)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Compare node_comm.
         *
         * Only support shm if new node_comm is equal to or a subset of shm node_comm.
         * Shm node_comm == a subset of node_comm is not supported, because it means
         * some processes of node_comm cannot be shared, but RMA operation simply checks
         * the node_id of a target process for distinguishing shm target. */
        shm_node_comm_ptr = shm_win->comm_ptr->node_comm;
        shm_node_size = shm_node_comm_ptr->local_size;

        if (node_size > shm_node_size)
            MPIDI_SHM_Wins_next_and_continue(elem);

        mpi_errno = MPIR_Comm_group_impl(shm_win->comm_ptr, &shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_translate_ranks_impl(node_group_ptr, node_size,
                                                    node_ranks, shm_node_group_ptr,
                                                    node_ranks_in_shm_node);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        shm_node_group_ptr = NULL;

        group_diff = 0;
        for (i = 0; i < node_size; i++) {
            /* not exist in shm_comm->node_comm */
            if (node_ranks_in_shm_node[i] == MPI_UNDEFINED) {
                group_diff = 1;
                break;
            }
        }
        if (group_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Gather the offset of base_addr from all local processes. Match only
         * when all of them are included in the shm segment in current shm_win.
         *
         * Note that this collective call must be called after checking the
         * group match in order to guarantee all the local processes can perform
         * this call. */
        base_shm_offs[node_rank] = (MPI_Aint) ((*win_ptr)->base)
            - (MPI_Aint) (shm_win->shm_base_addr);
        mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                        base_shm_offs, 1, MPI_AINT, node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        base_diff = 0;
        for (i = 0; i < comm_size; ++i) {
            int i_node_rank = (*win_ptr)->comm_ptr->intranode_table[i];
            /* entries below zero are skipped; the rest index base_shm_offs */
            if (i_node_rank >= 0) {
                MPIR_Assert(i_node_rank < node_size);

                /* the buffer must start and end inside the cached segment */
                if (base_shm_offs[i_node_rank] < 0 ||
                    base_shm_offs[i_node_rank] + (*win_ptr)->basic_info_table[i].size >
                    shm_win->shm_segment_len) {
                    base_diff = 1;
                    break;
                }
            }
        }

        if (base_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Found the first matched shm_win */
        *matched_win = shm_win;
        break;
    }

  fn_exit:
    /* Release the groups.  The result of a free is only reported when no
     * earlier error is pending, so that a failure which brought us here via
     * fn_fail is not silently replaced by the status of the cleanup. */
    if (node_group_ptr != NULL) {
        int free_errno = MPIR_Group_free_impl(node_group_ptr);
        if (mpi_errno == MPI_SUCCESS)
            mpi_errno = free_errno;
    }
    /* Only free it here when group_translate_ranks fails. */
    if (shm_node_group_ptr != NULL) {
        int free_errno = MPIR_Group_free_impl(shm_node_group_ptr);
        if (mpi_errno == MPI_SUCCESS)
            mpi_errno = free_errno;
    }

    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPIR_Win * win_ptr, MPIR_Request * ureq) { int mpi_errno = MPI_SUCCESS; intptr_t orig_data_sz, target_data_sz; int rank; int dt_contig ATTRIBUTE((unused)); MPI_Aint dt_true_lb ATTRIBUTE((unused)); MPIDU_Datatype*dtp; MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL; int made_progress = 0; MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE); MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE); MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync"); if (target_rank == MPI_PROC_NULL) { goto fn_exit; } MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp, dt_true_lb); if (target_data_sz == 0) { goto fn_exit; } rank = win_ptr->comm_ptr->rank; if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) { /* check if target is local and shared memory is allocated on window, * if so, we directly perform this operation on shared memory region. */ /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on * the same node. However, in ch3:sock, even if origin and target are on the same node, they do * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first, * which is only set to TRUE when SHM region is allocated in nemesis. * In future we need to figure out a way to check if origin and target are in the same "SHM comm". */ MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc); MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc); } /* Do =! rank first (most likely branch?) 
*/ if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) { mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype, result_addr, result_count, result_datatype, target_rank, target_disp, target_count, target_datatype, op, win_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (ureq) { /* Complete user request and release the ch3 ref */ mpi_errno = MPID_Request_complete(ureq); if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } } } else { MPIDI_RMA_Op_t *op_ptr = NULL; MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt; MPI_Aint origin_type_size; MPI_Aint target_type_size; int use_immed_pkt = FALSE, i; int is_origin_contig, is_target_contig, is_result_contig; MPI_Aint stream_elem_count, stream_unit_count; MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent; MPIDU_Datatype*origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL; int is_empty_origin = FALSE; /* Judge if origin buffer is empty */ if (op == MPI_NO_OP) is_empty_origin = TRUE; /* Append the operation to the window's RMA ops queue */ mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? 
*/ MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set); /******************** Setting operation struct areas ***********************/ op_ptr->origin_addr = (void *) origin_addr; op_ptr->origin_count = origin_count; op_ptr->origin_datatype = origin_datatype; op_ptr->result_addr = result_addr; op_ptr->result_count = result_count; op_ptr->result_datatype = result_datatype; op_ptr->target_rank = target_rank; /* Remember user request */ op_ptr->ureq = ureq; /* if source or target datatypes are derived, increment their * reference counts */ if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) { MPIDU_Datatype_get_ptr(origin_datatype, origin_dtp); } if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) { MPIDU_Datatype_get_ptr(result_datatype, result_dtp); } if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) { MPIDU_Datatype_get_ptr(target_datatype, target_dtp); } if (is_empty_origin == FALSE) { MPIDU_Datatype_get_size_macro(origin_datatype, origin_type_size); MPIR_Assign_trunc(orig_data_sz, origin_count * origin_type_size, intptr_t); } else { /* If origin buffer is empty, set origin data size to 0 */ orig_data_sz = 0; } MPIDU_Datatype_get_size_macro(target_datatype, target_type_size); /* Get size and count for predefined datatype elements */ if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) { predefined_dtp_size = target_type_size; predefined_dtp_count = target_count; MPIDU_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent); } else { MPIR_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL); MPIDU_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size); predefined_dtp_count = target_data_sz / predefined_dtp_size; MPIDU_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent); } MPIR_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0); /* Calculate number of predefined elements in each stream unit, and * total number of stream units. 
*/ stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent; stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1; MPIR_Assert(stream_elem_count > 0 && stream_unit_count > 0); for (i = 0; i < stream_unit_count; i++) { if (origin_dtp != NULL) { MPIDU_Datatype_add_ref(origin_dtp); } if (target_dtp != NULL) { MPIDU_Datatype_add_ref(target_dtp); } if (result_dtp != NULL) { MPIDU_Datatype_add_ref(result_dtp); } } if (is_empty_origin == FALSE) { MPIDU_Datatype_is_contig(origin_datatype, &is_origin_contig); } else { /* If origin buffer is empty, mark origin data as contig data */ is_origin_contig = 1; } MPIDU_Datatype_is_contig(target_datatype, &is_target_contig); MPIDU_Datatype_is_contig(result_datatype, &is_result_contig); /* Judge if we can use IMMED data packet */ if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) && MPIR_DATATYPE_IS_PREDEFINED(result_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig && is_result_contig) { if (target_data_sz <= MPIDI_RMA_IMMED_BYTES) use_immed_pkt = TRUE; } /* Judge if this operation is a piggyback candidate */ if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) && MPIR_DATATYPE_IS_PREDEFINED(result_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) { /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes * for origin, target and result data. We should extend this optimization to derived * datatypes as well. 
*/ if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) op_ptr->piggyback_lock_candidate = 1; } /************** Setting packet struct areas in operation ****************/ get_accum_pkt = &(op_ptr->pkt.get_accum); if (use_immed_pkt) { MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED); } else { MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM); } get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr + win_ptr->basic_info_table[target_rank].disp_unit * target_disp; get_accum_pkt->count = target_count; get_accum_pkt->datatype = target_datatype; get_accum_pkt->info.dataloop_size = 0; get_accum_pkt->op = op; get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle; get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE; if (use_immed_pkt) { void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data); mpi_errno = immed_copy(src, dest, orig_data_sz); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); } MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set); mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 && MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) { while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) { mpi_errno = wait_progress_engine(); if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno); } } } fn_exit: MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ fn_fail: goto fn_exit; /* --END ERROR HANDLING-- */ }
/*
 * MPIDI_CH3I_Put - CH3 implementation of an RMA put (optionally request based).
 *
 * origin_addr / origin_count / origin_datatype describe the local source
 * data; target_rank / target_disp / target_count / target_datatype describe
 * the destination inside the window win_ptr.  ureq may be NULL; when it is
 * not, it is completed right away on the local path and stored in the queued
 * operation on the remote path.
 *
 * The call is a no-op when target_rank is MPI_PROC_NULL or when the origin
 * data size is zero.  It fails with MPI_ERR_RMA_SYNC ("**rmasync") when the
 * window's access state is MPIDI_RMA_NONE.
 *
 * Returns mpi_errno: MPI_SUCCESS unless one of the steps reported an error.
 */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
                   int target_rank, MPI_Aint target_disp, int target_count,
                   MPI_Datatype target_datatype, MPIR_Win * win_ptr, MPIR_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int origin_contig ATTRIBUTE((unused));
    int my_rank;
    int run_locally;
    MPIDU_Datatype *type_ptr;
    MPI_Aint origin_true_lb ATTRIBUTE((unused));
    intptr_t origin_bytes;
    MPIDI_VC_t *my_vc = NULL, *peer_vc = NULL;
    int progressed = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    /* A put is only legal inside an access epoch. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Nothing to do for MPI_PROC_NULL ... */
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* ... or for an empty origin buffer. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, origin_contig, origin_bytes,
                            type_ptr, origin_true_lb);
    if (origin_bytes == 0) {
        goto fn_exit;
    }

    my_rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* The window has a shared memory region and the target is another
         * process: fetch both VCs so that their node ids can be compared
         * below. */
        /* FIXME: The decision to use SHM operations is based on whether
         * origin and target are on the same node.  In ch3:sock, however,
         * two processes on the same node are not within the same SHM
         * region.  ch3:sock is filtered out by testing shm_allocated first,
         * which is only set to TRUE when an SHM region is allocated in
         * nemesis.  In the future we need a way to check whether origin and
         * target are in the same "SHM comm". */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &my_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
    }

    /* The VCs are only dereferenced when the first two tests fail, and in
     * that case shm_allocated == TRUE implies they were fetched above. */
    run_locally = (target_rank == my_rank ||
                   win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
                   (win_ptr->shm_allocated == TRUE && my_vc->node_id == peer_vc->node_id));

    if (run_locally) {
        /* Local put: perform it directly through the SHM path. */
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, win_ptr);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }

        if (ureq) {
            /* Complete the user request and release the ch3 reference. */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *rma_op = NULL;
        MPIDI_CH3_Pkt_put_t *pkt = NULL;
        int immed = FALSE;
        int origin_is_contig, target_is_contig;
        int origin_predef, target_predef;

        /* Remote put: obtain an operation element and queue it up. */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* ---------------- fill in the operation struct ---------------- */

        /* FIXME: For contig and very short operations, use a streamlined op */
        rma_op->origin_addr = (void *) origin_addr;
        rma_op->origin_count = origin_count;
        rma_op->origin_datatype = origin_datatype;
        rma_op->target_rank = target_rank;

        /* Remember the user request. */
        rma_op->ureq = ureq;

        origin_predef = MPIR_DATATYPE_IS_PREDEFINED(origin_datatype);
        target_predef = MPIR_DATATYPE_IS_PREDEFINED(target_datatype);

        /* Derived datatypes get an extra reference for the queued op. */
        if (!origin_predef) {
            MPIDU_Datatype_get_ptr(origin_datatype, type_ptr);
            MPIDU_Datatype_add_ref(type_ptr);
        }
        if (!target_predef) {
            MPIDU_Datatype_get_ptr(target_datatype, type_ptr);
            MPIDU_Datatype_add_ref(type_ptr);
        }

        MPIDU_Datatype_is_contig(origin_datatype, &origin_is_contig);
        MPIDU_Datatype_is_contig(target_datatype, &target_is_contig);

        /* An IMMED packet needs predefined, contiguous types on both sides
         * and a payload of at most MPIDI_RMA_IMMED_BYTES. */
        if (origin_predef && target_predef && origin_is_contig && target_is_contig &&
            origin_bytes <= MPIDI_RMA_IMMED_BYTES) {
            immed = TRUE;
        }

        /* FIXME: currently the LOCK flag is only piggybacked on ops that use
         * predefined datatypes for both origin and target data.  This
         * optimization should be extended to derived datatypes as well. */
        if (origin_predef && target_predef &&
            origin_bytes <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) {
            rma_op->piggyback_lock_candidate = 1;
        }

        /* ------------- fill in the packet inside the operation ------------- */

        pkt = &(rma_op->pkt.put);

        if (immed) {
            MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_PUT);
        }

        /* Target address = target's window base + disp_unit * displacement. */
        pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        pkt->count = target_count;
        pkt->datatype = target_datatype;
        pkt->info.dataloop_size = 0;
        pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        pkt->source_win_handle = win_ptr->handle;
        pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (immed) {
            /* Small payload: copy the origin data into the packet itself. */
            void *copy_src = (void *) origin_addr;
            void *copy_dst = (void *) (pkt->info.data);

            mpi_errno = immed_copy(copy_src, copy_dst, origin_bytes);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        /* With a non-negative threshold configured, keep calling the
         * progress engine while the number of active requests is at or
         * above it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS) {
                    MPIR_ERR_POP(mpi_errno);
                }
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3U_Win_gather_info - exchange per-process window information.
 *
 * Allocates (*win_ptr)->basic_info_table with one entry per process of the
 * window's communicator and fills it through an in-place allgather of four
 * MPI_Aint values per process: window base address, window size,
 * displacement unit and window handle.
 *
 * base       local window base address that is published to the others
 * size       local window size
 * disp_unit  local displacement unit
 * info       not referenced by this routine
 * comm_ptr   not referenced by this routine; the communicator is taken from
 *            (*win_ptr)->comm_ptr
 * win_ptr    window object whose basic_info_table is created
 *
 * Returns mpi_errno: MPI_SUCCESS unless an allocation or the allgather
 * failed.  On failure the table is released and
 * (*win_ptr)->basic_info_table is reset to NULL, so that later cleanup code
 * which frees the table when it is non-NULL never sees a stale pointer.
 */
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                               MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    comm_size = (*win_ptr)->comm_ptr->local_size;
    rank = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        comm_size * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* get the addresses of the windows, window objects, and completion
     * counters of all processes.  allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */

    /* Place this process's four values in its own slot; the in-place
     * allgather then fills in everybody else's slots. */
    tmp_buf[4 * rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * rank + 1] = size;
    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Unpack the gathered values in the same order they were packed. */
    k = 0;
    for (i = 0; i < comm_size; i++) {
        (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
    }

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    /* The reap above releases the allocation registered through
     * MPIR_CHKPMEM_MALLOC; do not leave the window pointing at it. */
    (*win_ptr)->basic_info_table = NULL;
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPID_Win_free - tear down an RMA window.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window is still inside
 * an access epoch (any access state other than NONE / FENCE_ISSUED /
 * FENCE_GRANTED) or an exposure epoch.  Otherwise it drains outstanding
 * target-side activity, synchronizes with a barrier, lets the lower layer
 * release its resources, unlinks the window from the inactive list, frees
 * the window's communicator and internal pools, and finally releases the
 * window object itself.
 *
 * Returns mpi_errno: MPI_SUCCESS unless one of the steps reported an error.
 */
int MPID_Win_free(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int still_referenced;
    MPIR_Comm *win_comm;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_WIN_FREE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_WIN_FREE);

    MPIR_ERR_CHKANDJUMP(((*win_ptr)->states.access_state != MPIDI_RMA_NONE &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_GRANTED) ||
                        ((*win_ptr)->states.exposure_state != MPIDI_RMA_NONE),
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* Keep driving the progress engine until the target side is idle:
     *
     * 1. All passive locks on this target must have been released.  For some
     *    UNLOCK messages no ACK is sent back to the origin, so the window
     *    can only be freed once the lock is really gone.
     * 2. The AT completion counter must be zero.  It is incremented every
     *    time a GET-like operation is met, and the passive epoch may not be
     *    finished yet when the target enters Win_free, so GETs may still be
     *    in flight on this target.
     * 3. The lock queue must be empty; lock requests may still be waiting
     *    in it when the target enters Win_free.
     *
     * The loop additionally waits for the buffered target lock data bytes
     * and the sync request count to drop to zero. */
    while (!((*win_ptr)->current_lock_type == MPID_LOCK_NONE &&
             (*win_ptr)->at_completion_counter == 0 &&
             (*win_ptr)->target_lock_queue_head == NULL &&
             (*win_ptr)->current_target_lock_data_bytes == 0 &&
             (*win_ptr)->sync_request_cnt == 0)) {
        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    mpi_errno = MPID_Barrier((*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Let the lower layer free its window resources, if it has a hook. */
    if (MPIDI_CH3U_Win_hooks.win_free != NULL) {
        mpi_errno = MPIDI_CH3U_Win_hooks.win_free(win_ptr);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    /* Unlink the window from the global (inactive) list. */
    MPIR_Assert((*win_ptr)->active == FALSE);
    DL_DELETE(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* That was the last window: de-register the RMA progress hook. */
        mpi_errno = MPID_Progress_deregister_hook(MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    win_comm = (*win_ptr)->comm_ptr;
    mpi_errno = MPIR_Comm_free_impl(win_comm);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Release the per-window tables and pools. */
    if ((*win_ptr)->basic_info_table != NULL) {
        MPL_free((*win_ptr)->basic_info_table);
    }
    MPL_free((*win_ptr)->op_pool_start);
    MPL_free((*win_ptr)->target_pool_start);
    MPL_free((*win_ptr)->slots);
    MPL_free((*win_ptr)->target_lock_entry_pool_start);

    MPIR_Assert((*win_ptr)->current_target_lock_data_bytes == 0);

    /* Windows of the ALLOCATE and SHARED flavors free their attached buffer
     * here, but only when it did not come from a shared memory region and
     * is non-empty. */
    if (((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
         (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) &&
        (*win_ptr)->shm_allocated == FALSE && (*win_ptr)->size > 0) {
        MPL_free((*win_ptr)->base);
    }

    MPIR_Object_release_ref(*win_ptr, &still_referenced);
    /* MPI windows don't have reference count semantics, so no reference
     * should remain at this point. */
    MPIR_Assert(!still_referenced);
    MPIR_Handle_obj_free(&MPIR_Win_mem, *win_ptr);

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_WIN_FREE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/*
 * MPID_Compare_and_swap - CH3 implementation of the RMA compare-and-swap.
 *
 * origin_addr   value to be stored at the target
 * compare_addr  value the target location is compared against
 * result_addr   receives the result of the operation
 * datatype      type of all three buffers (a single element each)
 * target_rank / target_disp  location inside the window win_ptr
 *
 * The call is a no-op when target_rank is MPI_PROC_NULL and fails with
 * MPI_ERR_RMA_SYNC ("**rmasync") when the window's access state is
 * MPIDI_RMA_NONE.
 *
 * Returns mpi_errno: MPI_SUCCESS unless one of the steps reported an error.
 */
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPIR_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank;
    int run_locally;
    MPIDI_VC_t *my_vc = NULL, *peer_vc = NULL;
    int progressed = 0;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    /* The operation is only legal inside an access epoch. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    my_rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != my_rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* The window has a shared memory region and the target is another
         * process: fetch both VCs so that their node ids can be compared
         * below. */
        /* FIXME: The decision to use SHM operations is based on whether
         * origin and target are on the same node.  In ch3:sock, however,
         * two processes on the same node are not within the same SHM
         * region.  ch3:sock is filtered out by testing shm_allocated first,
         * which is only set to TRUE when an SHM region is allocated in
         * nemesis.  In the future we need a way to check whether origin and
         * target are in the same "SHM comm". */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, my_rank, &my_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &peer_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */

    /* The VCs are only dereferenced when the first two tests fail, and in
     * that case shm_allocated == TRUE implies they were fetched above. */
    run_locally = (target_rank == my_rank ||
                   win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
                   (win_ptr->shm_allocated == TRUE && my_vc->node_id == peer_vc->node_id));

    if (run_locally) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        MPIDI_RMA_Op_t *rma_op = NULL;
        MPIDI_CH3_Pkt_cas_t *pkt = NULL;
        MPI_Aint elem_size;
        void *copy_src = NULL, *copy_dst = NULL;

        /* Obtain an operation element that will go on the RMA ops queue. */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &rma_op);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /* ---------------- fill in the operation struct ---------------- */

        rma_op->origin_addr = (void *) origin_addr;
        rma_op->origin_count = 1;
        rma_op->origin_datatype = datatype;
        rma_op->result_addr = result_addr;
        rma_op->result_datatype = datatype;
        rma_op->compare_addr = (void *) compare_addr;
        rma_op->compare_datatype = datatype;
        rma_op->target_rank = target_rank;
        /* CAS is always able to piggyback LOCK */
        rma_op->piggyback_lock_candidate = 1;

        /* ------------- fill in the packet inside the operation ------------- */

        pkt = &(rma_op->pkt.cas);
        MPIDI_Pkt_init(pkt, MPIDI_CH3_PKT_CAS_IMMED);

        /* Target address = target's window base + disp_unit * displacement. */
        pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        pkt->datatype = datatype;
        pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        /* REQUIRE: All datatype arguments must be of the same, builtin
         * type and counts must be 1. */
        MPIDU_Datatype_get_size_macro(datatype, elem_size);
        MPIR_Assert(elem_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

        /* Both the origin value and the compare value travel inside the
         * packet itself. */
        copy_src = (void *) origin_addr;
        copy_dst = (void *) (&(pkt->origin_data));
        mpi_errno = immed_copy(copy_src, copy_dst, elem_size);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        copy_src = (void *) compare_addr;
        copy_dst = (void *) (&(pkt->compare_data));
        mpi_errno = immed_copy(copy_src, copy_dst, elem_size);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, rma_op);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &progressed);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }

        /* With a non-negative threshold configured, keep calling the
         * progress engine while the number of active requests is at or
         * above it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS) {
                    MPIR_ERR_POP(mpi_errno);
                }
            }
        }
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}