static int barrier_smp_intra(MPID_Comm *comm_ptr, mpir_errflag_t *errflag)
{
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;

    MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER &&
                MPIR_Comm_is_node_aware(comm_ptr));

    /* do the intranode barrier on all nodes */
    if (comm_ptr->node_comm != NULL) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* do the barrier across roots of all nodes */
    if (comm_ptr->node_roots_comm != NULL) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* release the local processes on each node with a 1-byte broadcast
       (0-byte broadcast just returns without doing anything) */
    if (comm_ptr->node_comm != NULL) {
        int i = 0;
        mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

  fn_exit:
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIU_ERR_SET(mpi_errno, *errflag, "**coll_fail");
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
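/* For illustration only: a user-level analogue of the three-phase algorithm
 * above, built from standard MPI-3 calls. MPI_Comm_split_type stands in for
 * comm_ptr->node_comm and the color-0 split for comm_ptr->node_roots_comm;
 * the function name and structure belong to this sketch, not to MPICH. */
#include <mpi.h>

static int smp_barrier_sketch(MPI_Comm comm)
{
    MPI_Comm node_comm, roots_comm;
    int node_rank, world_rank, rc;

    MPI_Comm_rank(comm, &world_rank);
    /* processes sharing my node, analogous to node_comm */
    MPI_Comm_split_type(comm, MPI_COMM_TYPE_SHARED, world_rank,
                        MPI_INFO_NULL, &node_comm);
    MPI_Comm_rank(node_comm, &node_rank);
    /* the lowest rank of each node, analogous to node_roots_comm */
    MPI_Comm_split(comm, node_rank == 0 ? 0 : MPI_UNDEFINED, world_rank,
                   &roots_comm);

    MPI_Barrier(node_comm);                  /* phase 1: intranode */
    if (roots_comm != MPI_COMM_NULL)
        MPI_Barrier(roots_comm);             /* phase 2: across node roots */
    char flag = 0;                           /* phase 3: release local ranks */
    rc = MPI_Bcast(&flag, 1, MPI_BYTE, 0, node_comm);

    if (roots_comm != MPI_COMM_NULL)
        MPI_Comm_free(&roots_comm);
    MPI_Comm_free(&node_comm);
    return rc;
}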
/**
 * \brief MPI-PAMI glue for the MPI_Win_create function
 *
 * Create a window object. Allocates an MPID_Win object and initializes it,
 * then allocates the collective info array, initializes our entry, and
 * performs an Allgather to distribute/collect the rest of the array entries.
 *
 * On first call, initializes (registers) protocol objects for lock,
 * get, and send operations with the message layer. Also creates the datatype
 * used to represent the rma_sends element of the collective info array,
 * which is used later to synchronize epoch-end events.
 *
 * \param[in]  base      Local window buffer
 * \param[in]  size      Local window size
 * \param[in]  disp_unit Displacement unit size
 * \param[in]  info      Window hints (not used)
 * \param[in]  comm_ptr  Communicator
 * \param[out] win_ptr   Window
 * \return MPI_SUCCESS, MPI_ERR_OTHER, or an error returned from
 *         MPI_Comm_dup or MPI_Allgather.
 */
int MPID_Win_create(void *base, MPI_Aint size, int disp_unit,
                    MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rc = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;
    MPID_Win *win;
    size_t rank;
    MPIDI_Win_info *winfo;

    rc = MPIDI_Win_init(size, disp_unit, win_ptr, info, comm_ptr,
                        MPI_WIN_FLAVOR_CREATE, MPI_WIN_UNIFIED);
    if (rc != MPI_SUCCESS)
        return rc;

    win = *win_ptr;
    win->base = base;

    rank = comm_ptr->rank;
    winfo = &win->mpid.info[rank];
    winfo->base_addr = base;
    winfo->win = win;
    winfo->disp_unit = disp_unit;

    rc = MPIDI_Win_allgather(size, win_ptr);
    if (rc != MPI_SUCCESS)
        return rc;

    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
    return mpi_errno;
}
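/* A minimal usage sketch of the user-facing call this glue implements
 * (buffer size and names are this example's own): */
#include <mpi.h>
#include <stdlib.h>

void win_create_example(MPI_Comm comm)
{
    const MPI_Aint nbytes = 1024;
    void *buf = malloc(nbytes);
    MPI_Win win;

    /* expose 1 KiB of existing local memory; displacements count in bytes */
    MPI_Win_create(buf, nbytes, 1 /* disp_unit */, MPI_INFO_NULL, comm, &win);
    /* ... RMA epochs (e.g. MPI_Win_fence with MPI_Put/MPI_Get) go here ... */
    MPI_Win_free(&win);
    free(buf);
}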
int MPID_Win_set_info(MPID_Win *win, MPID_Info *info)
{
    int mpi_errno = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;

    mpi_errno = MPIDI_Win_set_info(win, info);
    MPID_assert(mpi_errno == MPI_SUCCESS);
    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
    return mpi_errno;
}
/**
 * \brief MPI-PAMI glue for the MPI_Win_allocate function
 *
 * Create a window object. Allocates an MPID_Win object and initializes it,
 * then allocates the collective info array, initializes our entry, and
 * performs an Allgather to distribute/collect the rest of the array entries.
 *
 * On each process, allocates memory of at least size bytes, returns a
 * pointer to it, and returns a window object that can be used by all
 * processes in comm to perform RMA operations. The returned memory consists
 * of size bytes local to each process, starting at address base_ptr, and is
 * associated with the window as if the user had called MPI_Win_create on
 * existing memory. The size argument may be different at each process, and
 * size = 0 is valid; however, a library might allocate and expose more
 * memory in order to create a fast, globally symmetric allocation.
 *
 * \param[in]  size      Size of window in bytes (nonnegative integer)
 * \param[in]  disp_unit Local unit size for displacements, in bytes (positive integer)
 * \param[in]  info      Info argument (handle)
 * \param[in]  comm_ptr  Communicator (handle)
 * \param[out] base_ptr  Base address of the window in local memory
 * \param[out] win_ptr   Window object returned by the call (handle)
 * \return MPI_SUCCESS, MPI_ERR_ARG, MPI_ERR_COMM, MPI_ERR_INFO, MPI_ERR_OTHER,
 *         or MPI_ERR_SIZE
 */
int MPID_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info *info,
                      MPID_Comm *comm_ptr, void *base_ptr, MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rc = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;
    void *baseP;
    static char FCNAME[] = "MPID_Win_allocate";
    MPIDI_Win_info *winfo;
    MPID_Win *win;
    int rank;

    rc = MPIDI_Win_init(size, disp_unit, win_ptr, info, comm_ptr,
                        MPI_WIN_FLAVOR_ALLOCATE, MPI_WIN_UNIFIED);
    win = *win_ptr;

    if (size > 0) {
        baseP = MPIU_Malloc(size);
#ifndef MPIDI_NO_ASSERT
        MPID_assert(baseP != NULL);
#else
        MPIU_ERR_CHKANDJUMP((baseP == NULL), mpi_errno, MPI_ERR_BUFFER, "**bufnull");
#endif
    } else if (size == 0) {
        baseP = NULL;
    } else {
        /* size is negative here */
        MPIU_ERR_CHKANDSTMT(size < 0, mpi_errno, MPI_ERR_SIZE,
                            return mpi_errno, "**rmasize");
    }

    win->base = baseP;

    rank = comm_ptr->rank;
    winfo = &win->mpid.info[rank];
    winfo->base_addr = baseP;
    winfo->win = win;
    winfo->disp_unit = disp_unit;

    rc = MPIDI_Win_allgather(size, win_ptr);
    if (rc != MPI_SUCCESS)
        return rc;

    *(void **) base_ptr = (void *) win->base;
    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);

fn_fail:
    return mpi_errno;
}
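/* The user-facing counterpart, as a sketch: unlike MPI_Win_create, the
 * library allocates the memory and hands back the base pointer (the size and
 * names here are this example's own): */
#include <mpi.h>

void win_allocate_example(MPI_Comm comm)
{
    int *base = NULL;
    MPI_Win win;

    MPI_Win_allocate(100 * sizeof(int), sizeof(int), MPI_INFO_NULL,
                     comm, &base, &win);
    base[0] = 42;           /* the returned memory is ordinary local memory */
    MPI_Win_free(&win);     /* also releases the allocated memory */
}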
int MPID_Win_fence(int assert, MPID_Win *win)
{
    int mpi_errno = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;
    struct MPIDI_Win_sync *sync = &win->mpid.sync;

    MPID_PROGRESS_WAIT_WHILE(sync->total != sync->complete);
    sync->total = 0;
    sync->started = 0;
    sync->complete = 0;

    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
    return mpi_errno;
}
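/* The fence above backs the user-level active-target pattern; a minimal
 * sketch using only standard MPI calls (names local to the example): */
#include <mpi.h>

void fence_epoch_example(MPI_Comm comm)
{
    int rank, nprocs, value = 0;
    MPI_Win win;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);
    MPI_Win_create(&value, sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);

    MPI_Win_fence(0, win);                 /* open the epoch */
    if (rank == 0) {
        int one = 1;
        /* deposit into every other process's window */
        for (int t = 1; t < nprocs; t++)
            MPI_Put(&one, 1, MPI_INT, t, 0, 1, MPI_INT, win);
    }
    MPI_Win_fence(0, win);                 /* close: all Puts are complete */

    MPI_Win_free(&win);
}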
int MPID_Win_free(MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Win *win = *win_ptr;
    size_t rank = win->comm_ptr->rank;
    mpir_errflag_t errflag = MPIR_ERR_NONE;

    if (win->mpid.sync.origin_epoch_type != win->mpid.sync.target_epoch_type ||
        (win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_NONE &&
         win->mpid.sync.origin_epoch_type != MPID_EPOTYPE_REFENCE)) {
        MPIU_ERR_SETANDSTMT(mpi_errno, MPI_ERR_RMA_SYNC, return mpi_errno, "**rmasync");
    }

    mpi_errno = MPIR_Barrier_impl(win->comm_ptr, &errflag);
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**mpi_bcast");

    if (win->create_flavor == MPI_WIN_FLAVOR_SHARED)
        mpi_errno = MPIDI_SHM_Win_free(win_ptr);

    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE)
        MPIU_Free(win->base);

    struct MPIDI_Win_info *winfo = &win->mpid.info[rank];
#ifdef USE_PAMI_RDMA
    if (win->size != 0) {
        pami_result_t rc;
static int barrier_smp_intra(MPID_Comm *comm_ptr, MPIR_Errflag_t *errflag)
{
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;

    MPIU_Assert(MPIR_CVAR_ENABLE_SMP_COLLECTIVES && MPIR_CVAR_ENABLE_SMP_BARRIER &&
                MPIR_Comm_is_node_aware(comm_ptr));

#if defined(FINEGRAIN_MPI)
    int colocated_size = -1;
    int colocated_sense = -1;
    /* do barrier on osproc_colocated_comm */
    if (comm_ptr->osproc_colocated_comm != NULL) {
        colocated_size = comm_ptr->osproc_colocated_comm->local_size;
        MPIU_Assert((comm_ptr->osproc_colocated_comm->co_shared_vars != NULL) &&
                    (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars != NULL));
        MPIU_Assert(colocated_size > 1);
        colocated_sense =
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal;

        if (comm_ptr->osproc_colocated_comm->rank != 0) {       /* non-leader */
            (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter)++;
            if (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter ==
                (colocated_size - 1)) {     /* excluding the leader */
                comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 1;
            }
            while (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal ==
                   colocated_sense) {
                FG_Yield();
            }
        } else {        /* leader */
            while (comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal == 0) {
                FG_Yield();
            }
        }

#if 0   /* Non-optimized version */
        mpi_errno = MPIR_Barrier_impl(comm_ptr->osproc_colocated_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
#endif
    }
#endif

    /* do the intranode barrier on all nodes */
    if (comm_ptr->node_comm != NULL) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* do the barrier across roots of all nodes */
    if (comm_ptr->node_roots_comm != NULL) {
        mpi_errno = MPIR_Barrier_impl(comm_ptr->node_roots_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* release the local processes on each node with a 1-byte broadcast
       (0-byte broadcast just returns without doing anything) */
    if (comm_ptr->node_comm != NULL) {
        int i = 0;
        mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->node_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

#if defined(FINEGRAIN_MPI)
    if (comm_ptr->osproc_colocated_comm != NULL) {
        if (comm_ptr->osproc_colocated_comm->rank == 0) {       /* leader */
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->leader_signal = 0;
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_counter = 0;
            comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal =
                1 - comm_ptr->osproc_colocated_comm->co_shared_vars->co_barrier_vars->coproclet_signal;
        }
#if 0   /* Non-optimized version */
        /* release the colocated processes in each OS-process with a 1-byte broadcast
           (0-byte broadcast just returns without doing anything) */
        int i = 0;
        mpi_errno = MPIR_Bcast_impl(&i, 1, MPI_BYTE, 0, comm_ptr->osproc_colocated_comm, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIU_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIU_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
#endif
    }
#endif

  fn_exit:
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/*@
MPI_Barrier - Blocks until all processes in the communicator have
reached this routine.

Input Parameters:
. comm - communicator (handle)

Notes:
Blocks the caller until all processes in the communicator have called it;
that is, the call returns at any process only after all members of the
communicator have entered the call.

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
@*/
int MPI_Barrier(MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_BARRIER);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPID_MPI_COLL_FUNC_ENTER(MPID_STATE_MPI_BARRIER);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm, mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr(comm, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate communicator */
            MPID_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
            if (mpi_errno) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    mpi_errno = MPIR_Barrier_impl(comm_ptr, &errflag);
    if (mpi_errno) goto fn_fail;

    /* ... end of body of routine ... */

  fn_exit:
    MPID_MPI_COLL_FUNC_EXIT(MPID_STATE_MPI_BARRIER);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                 MPI_ERR_OTHER, "**mpi_barrier", "**mpi_barrier %C", comm);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
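/* A complete example program for the routine documented above: */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int rank;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    printf("rank %d: before the barrier\n", rank);
    MPI_Barrier(MPI_COMM_WORLD);    /* no rank proceeds until all arrive */
    printf("rank %d: after the barrier\n", rank);

    MPI_Finalize();
    return 0;
}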
/**
 * \brief Shut down the system
 *
 * At this time, no attempt is made to free memory being used for MPI
 * structures.
 *
 * \return MPI_SUCCESS
 */
int MPID_Finalize()
{
    pami_result_t rc;
    int mpierrno = MPI_SUCCESS;
    mpir_errflag_t errflag = MPIR_ERR_NONE;

    MPIR_Barrier_impl(MPIR_Process.comm_world, &errflag);

#ifdef MPIDI_STATISTICS
    if (MPIDI_Process.mp_statistics) {
        MPIDI_print_statistics();
    }
    MPIDI_close_pe_extension();
#endif

#ifdef DYNAMIC_TASKING
    mpidi_finalized = 1;
    if (mpidi_dynamic_tasking) {
        /* Tell the process group code that we're done with the process groups.
           This will notify PMI (with PMI_Finalize) if necessary. It also frees
           all PG structures, including the PG for COMM_WORLD, whose pointer is
           also saved in MPIDI_Process.my_pg */
        mpierrno = MPIDI_PG_Finalize();
        if (mpierrno) {
            TRACE_ERR("MPIDI_PG_Finalize returned with mpierrno=%d\n", mpierrno);
        }
        MPIDI_FreeParentPort();
    }
    if (_conn_info_list)
        MPIU_Free(_conn_info_list);
    MPIDI_free_all_tranid_node();
#endif

    /* ------------------------- */
    /* shutdown request queues   */
    /* ------------------------- */
    MPIDI_Recvq_finalize();

    PAMIX_Finalize(MPIDI_Client);

#ifdef MPID_NEEDS_ICOMM_WORLD
    MPIR_Comm_release_always(MPIR_Process.icomm_world, 0);
#endif
    MPIR_Comm_release_always(MPIR_Process.comm_self, 0);
    MPIR_Comm_release_always(MPIR_Process.comm_world, 0);

    rc = PAMI_Context_destroyv(MPIDI_Context, MPIDI_Process.avail_contexts);
    MPID_assert_always(rc == PAMI_SUCCESS);

    rc = PAMI_Client_destroy(&MPIDI_Client);
    MPID_assert_always(rc == PAMI_SUCCESS);

#ifdef MPIDI_TRACE
    {
        int i;
        for (i = 0; i < MPIDI_Process.numTasks; i++) {
            if (MPIDI_Trace_buf[i].R)
                MPIU_Free(MPIDI_Trace_buf[i].R);
            if (MPIDI_Trace_buf[i].PR)
                MPIU_Free(MPIDI_Trace_buf[i].PR);
            if (MPIDI_Trace_buf[i].S)
                MPIU_Free(MPIDI_Trace_buf[i].S);
        }
    }
    MPIU_Free(MPIDI_Trace_buf);
#endif

#ifdef OUT_OF_ORDER_HANDLING
    MPIU_Free(MPIDI_In_cntr);
    MPIU_Free(MPIDI_Out_cntr);
#endif

    if (TOKEN_FLOW_CONTROL_ON) {
#if TOKEN_FLOW_CONTROL
        extern char *EagerLimit;
        if (EagerLimit)
            MPIU_Free(EagerLimit);
        MPIU_Free(MPIDI_Token_cntr);
        MPIDI_close_mm();
#else
        MPID_assert_always(0);
#endif
    }

    return MPI_SUCCESS;
}
/* This function produces topology-aware trees for reductions and broadcasts, with
 * different K values. It is a heavy-weight function: it allocates shared memory,
 * generates topology information, builds a package-level tree (for package leaders)
 * and a per-package tree, then combines these in shared memory for the other ranks
 * to read out.
 */
int MPIDI_SHM_topology_tree_init(MPIR_Comm * comm_ptr, int root, int bcast_k,
                                 MPIR_Treealgo_tree_t * bcast_tree, int *bcast_topotree_fail,
                                 int reduce_k, MPIR_Treealgo_tree_t * reduce_tree,
                                 int *reduce_topotree_fail, MPIR_Errflag_t * errflag)
{
    int *shared_region;
    MPL_shm_hnd_t fd;
    int num_ranks, rank;
    int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS;
    size_t shm_size;
    int **bind_map = NULL;
    int *max_entries_per_level = NULL;
    int **ranks_per_package = NULL;
    int *package_ctr = NULL;
    size_t topo_depth = 0;
    int package_level = 0, i, max_ranks_per_package = 0;
    bool mapfail_flag = false;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);

    num_ranks = MPIR_Comm_size(comm_ptr);
    rank = MPIR_Comm_rank(comm_ptr);

    /* Calculate the size of shared memory that would be needed */
    shm_size = sizeof(int) * 5 * num_ranks + num_ranks * sizeof(cpu_set_t);

    /* STEP 1. Create the shared memory region for exchanging topology information
     * (root only) */
    mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, shm_size, &fd, (void **) &shared_region,
                                            &mapfail_flag);
    if (mpi_errno || mapfail_flag) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* STEP 2. Collect the cpu_set of each rank at the root */
    cpu_set_t my_cpu_set;
    CPU_ZERO(&my_cpu_set);
    sched_getaffinity(0, sizeof(my_cpu_set), &my_cpu_set);
    ((cpu_set_t *) (shared_region))[rank] = my_cpu_set;
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* STEP 3. The root now has all the cpu_set information; build the tree */
    if (rank == root) {
        topo_depth = hwloc_topology_get_depth(MPIR_Process.hwloc_topology);
        bind_map = (int **) MPL_malloc(num_ranks * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!bind_map, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < num_ranks; ++i) {
            bind_map[i] = (int *) MPL_calloc(topo_depth, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!bind_map[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_SHM_hwloc_init_bindmap(num_ranks, topo_depth, shared_region, bind_map);
        /* Done building the topology information */

        /* STEP 3.1. Count the maximum entries at each level - used for breaking the
         * tree into intra/inter socket */
        max_entries_per_level = (int *) MPL_calloc(topo_depth, sizeof(size_t), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!max_entries_per_level, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_level =
            MPIDI_SHM_topotree_get_package_level(topo_depth, max_entries_per_level, num_ranks,
                                                 bind_map);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            fprintf(stderr, "Breaking topology at :: %d (default= %d)\n", package_level,
                    MPIDI_SHM_TOPOTREE_CUTOFF);

        /* STEP 3.2. Allocate space for the entries that go in each package based on
         * hwloc info */
        ranks_per_package = (int **)
            MPL_malloc(max_entries_per_level[package_level] * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!ranks_per_package, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_ctr = (int *) MPL_calloc(max_entries_per_level[package_level], sizeof(int),
                                         MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!package_ctr, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            package_ctr[i] = 0;
            ranks_per_package[i] = (int *) MPL_calloc(num_ranks, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!ranks_per_package[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        /* sort the ranks into packages based on the binding information */
        for (i = 0; i < num_ranks; ++i) {
            int package = bind_map[i][package_level];
            ranks_per_package[package][package_ctr[package]++] = i;
        }
        max_ranks_per_package = 0;
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            max_ranks_per_package = MPL_MAX(max_ranks_per_package, package_ctr[i]);
        }
        /* At this point we have done the common work in extracting topology
         * information and restructuring it to our needs. Now we generate the tree. */

        /* For Bcast, package leaders are added before the package-local ranks, and
         * the per_package tree is left-skewed */
        mpi_errno = MPIDI_SHM_gen_tree(bcast_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 1 /*package_leaders_first */ ,
                                       0 /*left_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
                MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* Every rank copies its tree out from shared memory */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, bcast_tree, bcast_topotree_fail);
    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("BCAST", comm_ptr->context_id, rank, bcast_tree);

    /* Wait until shared memory is available */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* Generate the reduce tree */
    /* For Reduce, package leaders are added after the package-local ranks, and the
     * per_package tree is right-skewed (children are added in reverse order) */
    if (rank == root) {
        memset(shared_region, 0, shm_size);
        mpi_errno = MPIDI_SHM_gen_tree(reduce_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 0 /*package_leaders_last */ ,
                                       1 /*right_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
                MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* each rank copies the reduce tree out */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, reduce_tree, reduce_topotree_fail);
    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("REDUCE", comm_ptr->context_id, rank, reduce_tree);

    /* Wait for all ranks to copy out the tree */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ?
            MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* Cleanup */
    if (rank == root) {
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            MPL_free(ranks_per_package[i]);
        }
        MPL_free(ranks_per_package);
        MPL_free(package_ctr);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            for (i = 0; i < topo_depth; ++i) {
                fprintf(stderr, "Level :: %d, Max :: %d\n", i, max_entries_per_level[i]);
            }
        for (i = 0; i < num_ranks; ++i) {
            MPL_free(bind_map[i]);
        }
        MPL_free(max_entries_per_level);
        MPL_free(bind_map);
    }
    MPIDIU_destroy_shm_segment(shm_size, &fd, (void **) &shared_region);

  fn_exit:
    if (rank == root && MPIDI_SHM_TOPOTREE_DEBUG)
        fprintf(stderr, "Done creating tree for %d\n", num_ranks);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
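/* STEP 2 above ships each rank's affinity mask through a shared-memory
 * segment; the same information can be collected with a plain MPI_Gather, as
 * in this illustrative, Linux-only sketch (function name and output format
 * are this example's own): */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

void gather_affinity_example(MPI_Comm comm, int root)
{
    int rank, nprocs;
    cpu_set_t mine;
    cpu_set_t *all = NULL;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nprocs);

    CPU_ZERO(&mine);
    sched_getaffinity(0, sizeof(mine), &mine);      /* this process's mask */

    if (rank == root)
        all = malloc(nprocs * sizeof(cpu_set_t));
    /* a cpu_set_t is plain bytes, so MPI_BYTE is a safe transport */
    MPI_Gather(&mine, sizeof(cpu_set_t), MPI_BYTE,
               all, sizeof(cpu_set_t), MPI_BYTE, root, comm);

    if (rank == root) {
        int r, c;
        for (r = 0; r < nprocs; r++)
            for (c = 0; c < CPU_SETSIZE; c++)
                if (CPU_ISSET(c, &all[r]))
                    printf("rank %d may run on cpu %d\n", r, c);
        free(all);
    }
}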
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* When allocating the shared memory region segment, we need the comm of
     * processes that are on the same node as this process (node_comm).
     * If node_comm == NULL, this process is the only one on this node, therefore
     * we use comm_self as node comm. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows and window objects of all processes.
     * allocate a temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    (*win_ptr)->shm_segment_len = 0;
    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    if ((*win_ptr)->shm_segment_len == 0) {
        (*win_ptr)->base = NULL;
    } else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create the shared memory region for all processes in win and map it */
            mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                                      (*win_ptr)->shm_segment_len,
                                                      (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize the handle and broadcast it to the other processes in win */
            mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                          &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                        node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink the shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        } else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get the serialized handle from rank 0 and deserialize it */
            mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                        node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                                strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to the shared memory region created by rank 0 */
            mpi_errno = MPL_shm_seg_attach((*win_ptr)->shm_segment_handle,
                                           (*win_ptr)->shm_segment_len,
                                           (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocate the interprocess mutex segment. */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create the shared memory region for all processes in win and map it */
            mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                                      sizeof(MPIDI_CH3I_SHM_MUTEX),
                                                      (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize the handle and broadcast it to the other processes in win */
            mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                          &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                        node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink the shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        } else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get the serialized handle from rank 0 and deserialize it */
            mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                        node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle,
                                                serialized_hnd, strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to the shared memory region created by rank 0 */
            mpi_errno = MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                           sizeof(MPIDI_CH3I_SHM_MUTEX),
                                           (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous process that
                     * has allocated non-zero bytes of shared memory. We can not
                     * simply use "i-1" for the previous process because rank "i-1"
                     * might not have allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    } else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                } else {
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via the shared memory region */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
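/* At user level, the same shared-memory machinery surfaces through
 * MPI_Win_allocate_shared and MPI_Win_shared_query; an illustrative sketch
 * (all names local to this example): */
#include <mpi.h>
#include <stdio.h>

void shared_window_example(void)
{
    MPI_Comm node;
    MPI_Win win;
    int rank, nsize, *mybase;

    /* group the ranks that can share memory, like node_comm above */
    MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0,
                        MPI_INFO_NULL, &node);
    MPI_Comm_rank(node, &rank);
    MPI_Comm_size(node, &nsize);

    /* every rank contributes one int to a contiguous node-wide segment */
    MPI_Win_allocate_shared(sizeof(int), sizeof(int), MPI_INFO_NULL,
                            node, &mybase, &win);
    *mybase = rank;

    MPI_Barrier(node);  /* as in the implementation, a barrier separates
                         * writes from neighbors' direct loads */
    if (rank == 0 && nsize > 1) {
        MPI_Aint sz;
        int disp, *peer;
        /* locate rank 1's slice of the segment */
        MPI_Win_shared_query(win, 1, &sz, &disp, &peer);
        printf("rank 1 wrote %d\n", *peer);
    }
    MPI_Barrier(node);

    MPI_Win_free(&win);
    MPI_Comm_free(&node);
}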
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                                      MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    MPIR_Comm *node_comm_ptr = NULL;
    int node_rank;
    int comm_rank, comm_size;
    MPI_Aint *tmp_buf = NULL;
    int i, k;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
        goto fn_exit;
    }

    comm_size = (*win_ptr)->comm_ptr->local_size;
    comm_rank = (*win_ptr)->comm_ptr->rank;

    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_rank = node_comm_ptr->rank;

    (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t);

    mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->info_shm_segment_handle);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        char *serialized_hnd_ptr = NULL;

        /* create the shared memory region for all processes in win and map it */
        mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
                                                  (*win_ptr)->info_shm_segment_len,
                                                  (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* serialize the handle and broadcast it to the other processes in win */
        mpi_errno = MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
                                                      &serialized_hnd_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                    node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* wait for other processes to attach to win */
        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink the shared memory region so it gets deleted when all processes exit */
        mpi_errno = MPL_shm_seg_remove((*win_ptr)->info_shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    } else {
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get the serialized handle from rank 0 and deserialize it */
        mpi_errno = MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0,
                                    node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
                                            strlen(serialized_hnd));
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* attach to the shared memory region created by rank 0 */
        mpi_errno = MPL_shm_seg_attach((*win_ptr)->info_shm_segment_handle,
                                       (*win_ptr)->info_shm_segment_len,
                                       (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    }

    (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);

    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    tmp_buf[4 * comm_rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * comm_rank + 1] = size;
    tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        /* only node_rank == 0 writes results to basic_info_table on the shared memory region */
        k = 0;
        for (i = 0; i < comm_size; i++) {
            (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
            (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
        }
    }

    /* Make sure that all local processes see the results written by node_rank == 0 */
    mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
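/* Both window setup paths above use the same handshake: rank 0 creates a
 * segment, broadcasts a serialized handle, everyone attaches, and a barrier
 * delays the unlink until all are attached. A stand-alone POSIX-shm
 * rendition of that pattern (names such as attach_node_segment are invented
 * for this sketch; error handling omitted for brevity): */
#define _GNU_SOURCE
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <mpi.h>

void *attach_node_segment(MPI_Comm node_comm, size_t len)
{
    char name[64] = "";
    int rank, fd = -1;
    void *addr;

    MPI_Comm_rank(node_comm, &rank);
    if (rank == 0) {
        /* "serialize the handle": here the handle is just the shm name */
        snprintf(name, sizeof(name), "/winseg.%d", (int) getpid());
        fd = shm_open(name, O_CREAT | O_RDWR, 0600);
        ftruncate(fd, (off_t) len);
    }
    /* broadcast the serialized handle to the rest of the node */
    MPI_Bcast(name, sizeof(name), MPI_CHAR, 0, node_comm);
    if (rank != 0)
        fd = shm_open(name, O_RDWR, 0600);

    addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    close(fd);

    /* wait for every process to attach, then unlink so the segment is
     * deleted once the last process unmaps it */
    MPI_Barrier(node_comm);
    if (rank == 0)
        shm_unlink(name);
    return addr;
}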
int MPID_Win_free(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int in_use;
    MPIR_Comm *comm_ptr;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_WIN_FREE);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPID_WIN_FREE);

    MPIR_ERR_CHKANDJUMP(((*win_ptr)->states.access_state != MPIDI_RMA_NONE &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_ISSUED &&
                         (*win_ptr)->states.access_state != MPIDI_RMA_FENCE_GRANTED) ||
                        ((*win_ptr)->states.exposure_state != MPIDI_RMA_NONE),
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    /* 1. Here we must wait until all passive locks are released on this
     *    target, because for some UNLOCK messages we do not send an ACK back
     *    to the origin; we must wait until the lock is released so that we
     *    can free the window.
     * 2. We also need to wait until the AT completion counter reaches zero:
     *    this counter is incremented every time we meet a GET-like operation,
     *    and it is possible that when the target enters Win_free the passive
     *    epoch is not finished yet and GETs are still in progress on this
     *    target.
     * 3. We also need to wait until the lock queue becomes empty. It is
     *    possible that some lock requests are still waiting in the queue when
     *    the target enters Win_free. */
    while ((*win_ptr)->current_lock_type != MPID_LOCK_NONE ||
           (*win_ptr)->at_completion_counter != 0 ||
           (*win_ptr)->target_lock_queue_head != NULL ||
           (*win_ptr)->current_target_lock_data_bytes != 0 ||
           (*win_ptr)->sync_request_cnt != 0) {
        mpi_errno = wait_progress_engine();
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Barrier_impl((*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* Free window resources in the lower layer. */
    if (MPIDI_CH3U_Win_hooks.win_free != NULL) {
        mpi_errno = MPIDI_CH3U_Win_hooks.win_free(win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

    /* dequeue window from the global list */
    MPIR_Assert((*win_ptr)->active == FALSE);
    MPL_DL_DELETE(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* this is the last window; de-register the RMA progress hook */
        mpi_errno = MPID_Progress_deregister_hook(MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    comm_ptr = (*win_ptr)->comm_ptr;
    mpi_errno = MPIR_Comm_free_impl(comm_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if ((*win_ptr)->basic_info_table != NULL)
        MPL_free((*win_ptr)->basic_info_table);
    MPL_free((*win_ptr)->op_pool_start);
    MPL_free((*win_ptr)->target_pool_start);
    MPL_free((*win_ptr)->slots);
    MPL_free((*win_ptr)->target_lock_entry_pool_start);

    MPIR_Assert((*win_ptr)->current_target_lock_data_bytes == 0);

    /* Free the attached buffer for windows created with MPI_Win_allocate() */
    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        if ((*win_ptr)->shm_allocated == FALSE && (*win_ptr)->size > 0) {
            MPL_free((*win_ptr)->base);
        }
    }

    MPIR_Object_release_ref(*win_ptr, &in_use);
    /* MPI windows don't have reference count semantics, so this should always be true */
    MPIR_Assert(!in_use);
    MPIR_Handle_obj_free(&MPIR_Win_mem, *win_ptr);

  fn_exit:
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPID_WIN_FREE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
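/* Win_free is collective and requires that every epoch on the window be
 * closed first. A user-level lifecycle that satisfies the preconditions the
 * code above checks (illustrative; assumes at least two ranks in comm): */
#include <mpi.h>

void passive_target_example(MPI_Comm comm)
{
    int rank, buf = 0;
    MPI_Win win;

    MPI_Comm_rank(comm, &rank);
    MPI_Win_create(&buf, sizeof(int), sizeof(int), MPI_INFO_NULL, comm, &win);

    if (rank == 0) {
        int one = 1;
        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, 1, 0, win);    /* target rank 1 */
        MPI_Put(&one, 1, MPI_INT, 1, 0, 1, MPI_INT, win);
        MPI_Win_unlock(1, win);     /* completes the Put, closes the epoch */
    }

    /* all locks are released and no epoch is open, so the collective free
     * (which barriers internally, as above) is legal here */
    MPI_Win_free(&win);
}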