static int MPIDI_CH3I_SHM_Wins_match(MPIR_Win ** win_ptr, MPIR_Win ** matched_win,
                                     MPI_Aint ** base_shm_offs_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int i, comm_size;
    int node_size, node_rank, shm_node_size;
    int group_diff;
    int base_diff;
    MPIR_Comm *node_comm_ptr = NULL, *shm_node_comm_ptr = NULL;
    int *node_ranks = NULL, *node_ranks_in_shm_node = NULL;
    MPIR_Group *node_group_ptr = NULL, *shm_node_group_ptr = NULL;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPI_Aint *base_shm_offs;

    MPIDI_SHM_Win_t *elem = shm_wins_list;

    MPIR_CHKLMEM_DECL(2);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);

    *matched_win = NULL;
    base_shm_offs = *base_shm_offs_ptr;
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    comm_size = (*win_ptr)->comm_ptr->local_size;

    MPIR_CHKLMEM_MALLOC(node_ranks, int *, node_size * sizeof(int), mpi_errno, "node_ranks");
    MPIR_CHKLMEM_MALLOC(node_ranks_in_shm_node, int *, node_size * sizeof(int),
                        mpi_errno, "node_ranks_in_shm_comm");

    for (i = 0; i < node_size; i++) {
        node_ranks[i] = i;
    }

    mpi_errno = MPIR_Comm_group_impl(node_comm_ptr, &node_group_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    while (elem != NULL) {
        MPIR_Win *shm_win = elem->win;
        if (!shm_win)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Compare node_comm.
         *
         * Only support shm if new node_comm is equal to or a subset of shm node_comm.
         * Shm node_comm == a subset of node_comm is not supported, because it means
         * some processes of node_comm cannot be shared, but RMA operation simply checks
         * the node_id of a target process for distinguishing shm target. */
        shm_node_comm_ptr = shm_win->comm_ptr->node_comm;
        shm_node_size = shm_node_comm_ptr->local_size;

        if (node_size > shm_node_size)
            MPIDI_SHM_Wins_next_and_continue(elem);

        mpi_errno = MPIR_Comm_group_impl(shm_win->comm_ptr, &shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_translate_ranks_impl(node_group_ptr, node_size, node_ranks,
                                                    shm_node_group_ptr, node_ranks_in_shm_node);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        shm_node_group_ptr = NULL;

        group_diff = 0;
        for (i = 0; i < node_size; i++) {
            /* not exist in shm_comm->node_comm */
            if (node_ranks_in_shm_node[i] == MPI_UNDEFINED) {
                group_diff = 1;
                break;
            }
        }
        if (group_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Gather the offset of base_addr from all local processes. Match only
         * when all of them are included in the shm segment in current shm_win.
         *
         * Note that this collective call must be called after checking the
         * group match in order to guarantee all the local processes can perform
         * this call. */
        base_shm_offs[node_rank] = (MPI_Aint) ((*win_ptr)->base)
            - (MPI_Aint) (shm_win->shm_base_addr);
        mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                        base_shm_offs, 1, MPI_AINT, node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        base_diff = 0;
        for (i = 0; i < comm_size; ++i) {
            int i_node_rank = (*win_ptr)->comm_ptr->intranode_table[i];
            if (i_node_rank >= 0) {
                MPIR_Assert(i_node_rank < node_size);

                if (base_shm_offs[i_node_rank] < 0 ||
                    base_shm_offs[i_node_rank] + (*win_ptr)->basic_info_table[i].size >
                    shm_win->shm_segment_len) {
                    base_diff = 1;
                    break;
                }
            }
        }

        if (base_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Found the first matched shm_win */
        *matched_win = shm_win;
        break;
    }

  fn_exit:
    if (node_group_ptr != NULL)
        mpi_errno = MPIR_Group_free_impl(node_group_ptr);
    /* Only free it here when group_translate_ranks fails. */
    if (shm_node_group_ptr != NULL)
        mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
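/* For reference, a minimal sketch (assumed, not copied from this file) of the
 * list element type and the MPIDI_SHM_Wins_next_and_continue macro that the
 * matching loop above relies on: the macro advances the list cursor and
 * restarts the enclosing while loop, so every early "reject" path simply moves
 * on to the next cached shared-memory window. The real definitions live
 * elsewhere in the ch3 sources and may carry additional fields. */
typedef struct MPIDI_SHM_Win_sketch {
    struct MPIDI_SHM_Win_sketch *next;  /* next cached shm window in the list */
    MPIR_Win *win;                      /* window whose shm segment may be reused */
} MPIDI_SHM_Win_sketch_t;

#define MPIDI_SHM_Wins_next_and_continue_sketch(elem) { (elem) = (elem)->next; continue; }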
/*@
 MPI_Group_translate_ranks - Translates the ranks of processes in one group to
                             those in another group

Input Parameters:
+ group1 - group1 (handle)
. n - number of ranks in 'ranks1' and 'ranks2' arrays (integer)
. ranks1 - array of zero or more valid ranks in 'group1'
- group2 - group2 (handle)

Output Parameters:
. ranks2 - array of corresponding ranks in 'group2', 'MPI_UNDEFINED' when no
  correspondence exists.

  As a special case (see the MPI-2 errata), if the input rank is
  'MPI_PROC_NULL', 'MPI_PROC_NULL' is given as the output rank.

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
@*/
int MPI_Group_translate_ranks(MPI_Group group1, int n, const int ranks1[],
                              MPI_Group group2, int ranks2[])
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Group *group_ptr1 = NULL;
    MPIR_Group *group_ptr2 = NULL;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_GROUP_TRANSLATE_RANKS);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    /* The routines that set up the group data structures must be executed
     * within a mutex.  As most of the group routines are not performance
     * critical, we simply run these routines within the SINGLE_CS */
    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_GROUP_TRANSLATE_RANKS);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_GROUP(group1, mpi_errno);
            MPIR_ERRTEST_GROUP(group2, mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif

    /* Convert MPI object handles to object pointers */
    MPIR_Group_get_ptr(group1, group_ptr1);
    MPIR_Group_get_ptr(group2, group_ptr2);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate group_ptr */
            MPIR_Group_valid_ptr(group_ptr1, mpi_errno);
            MPIR_Group_valid_ptr(group_ptr2, mpi_errno);
            /* If either group_ptr is not valid, it will be reset to null */

            MPIR_ERRTEST_ARGNEG(n, "n", mpi_errno);
            if (group_ptr1) {
                /* Check that the rank entries are valid */
                int size1 = group_ptr1->size;
                int i;
                for (i = 0; i < n; i++) {
                    if ((ranks1[i] < 0 && ranks1[i] != MPI_PROC_NULL) || ranks1[i] >= size1) {
                        mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                         FCNAME, __LINE__, MPI_ERR_RANK,
                                                         "**rank", "**rank %d %d",
                                                         ranks1[i], size1);
                        goto fn_fail;
                    }
                }
            }
            MPIR_ERRTEST_ARGNULL(ranks2, "ranks2", mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    mpi_errno = MPIR_Group_translate_ranks_impl(group_ptr1, n, ranks1, group_ptr2, ranks2);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* ... end of body of routine ... */

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_GROUP_TRANSLATE_RANKS);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__,
                                 MPI_ERR_OTHER, "**mpi_group_translate_ranks",
                                 "**mpi_group_translate_ranks %G %d %p %G %p",
                                 group1, n, ranks1, group2, ranks2);
    }
    mpi_errno = MPIR_Err_return_comm(NULL, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
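/* A self-contained usage sketch (an illustrative example, not part of the file
 * above) exercising the public MPI_Group_translate_ranks binding defined
 * above: subgroup ranks are mapped back to their MPI_COMM_WORLD ranks. Ranks
 * with no counterpart in the target group come back as MPI_UNDEFINED. Run
 * with at least two processes so the two subgroup members are distinct. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    MPI_Group world_group, sub_group;
    int wsize, wrank;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &wsize);
    MPI_Comm_rank(MPI_COMM_WORLD, &wrank);
    MPI_Comm_group(MPI_COMM_WORLD, &world_group);

    /* Build a two-member subgroup from the first and last world ranks. */
    int members[2] = { 0, wsize - 1 };
    MPI_Group_incl(world_group, 2, members, &sub_group);

    /* Map subgroup ranks {0, 1} back to their MPI_COMM_WORLD ranks. */
    int sub_ranks[2] = { 0, 1 };
    int world_ranks[2];
    MPI_Group_translate_ranks(sub_group, 2, sub_ranks, world_group, world_ranks);

    if (wrank == 0)
        printf("subgroup ranks {0,1} -> world ranks {%d,%d}\n",
               world_ranks[0], world_ranks[1]);

    MPI_Group_free(&sub_group);
    MPI_Group_free(&world_group);
    MPI_Finalize();
    return 0;
}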
int MPID_Comm_failed_bitarray(MPID_Comm *comm_ptr, uint32_t **bitarray, int acked)
{
    int mpi_errno = MPI_SUCCESS;
    int size, i;
    uint32_t bit;
    int *failed_procs, *group_procs;
    MPID_Group *failed_group, *comm_group;
    MPIU_CHKLMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_COMM_FAILED_BITARRAY);

    MPIDI_FUNC_ENTER(MPID_STATE_COMM_FAILED_BITARRAY);

    /* TODO - Fix this for intercommunicators */
    size = comm_ptr->local_size;

    /* We can fit sizeof(uint32_t) * 8 ranks in one uint32_t, so divide the
     * size by that. */
    /* This buffer will be handed back to the calling function so we use a
     * "real" malloc here and expect the caller to free the buffer later. The
     * other buffers in this function are temporary and will be automatically
     * cleaned up at the end of the function. */
    *bitarray = (uint32_t *) MPIU_Malloc(sizeof(uint32_t) * (size / (sizeof(uint32_t) * 8) + 1));
    if (!(*bitarray)) {
        fprintf(stderr, "Could not allocate space for bitarray\n");
        PMPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (i = 0; i <= size / (sizeof(uint32_t) * 8); i++)
        (*bitarray)[i] = 0;

    mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    if (acked)
        MPIDI_CH3U_Get_failed_group(comm_ptr->dev.last_ack_rank, &failed_group);
    else
        MPIDI_CH3U_Get_failed_group(-2, &failed_group);

    if (failed_group == MPID_Group_empty)
        goto fn_exit;

    MPIU_CHKLMEM_MALLOC(group_procs, int *, sizeof(int) * failed_group->size, mpi_errno,
                        "group_procs");
    for (i = 0; i < failed_group->size; i++)
        group_procs[i] = i;
    MPIU_CHKLMEM_MALLOC(failed_procs, int *, sizeof(int) * failed_group->size, mpi_errno,
                        "failed_procs");
    MPIR_Comm_group_impl(comm_ptr, &comm_group);
    MPIR_Group_translate_ranks_impl(failed_group, failed_group->size, group_procs,
                                    comm_group, failed_procs);

    /* The bits will actually be ordered in descending order rather than
     * ascending. This is purely for readability since it makes no practical
     * difference. So if the bits look like this:
     *
     * 10001100 01001000 00000000 00000001
     *
     * Then processes 1, 5, 6, 9, 12, and 32 have failed. */
    for (i = 0; i < failed_group->size; i++) {
        bit = 0x80000000;
        bit >>= failed_procs[i] % (sizeof(uint32_t) * 8);

        (*bitarray)[failed_procs[i] / (sizeof(uint32_t) * 8)] |= bit;
    }

    MPIR_Group_free_impl(comm_group);

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_FUNC_EXIT(MPID_STATE_COMM_FAILED_BITARRAY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
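/* A hypothetical helper (not part of the source above) showing how a caller
 * could test a single rank against the bitarray produced by
 * MPID_Comm_failed_bitarray, assuming the MSB-first packing used when the
 * bits are set: rank r maps to bit (r % 32), counted from the most
 * significant bit, of word (r / 32). */
#include <stdint.h>

static int rank_is_failed(const uint32_t *bitarray, int rank)
{
    /* Select the same bit that the fill loop above would have set for this rank. */
    uint32_t bit = (uint32_t) 0x80000000 >> (rank % (sizeof(uint32_t) * 8));
    return (bitarray[rank / (sizeof(uint32_t) * 8)] & bit) != 0;
}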