/* comm shrink impl; assumes that standard error checking has already taken * place in the calling function */ int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr) { int mpi_errno = MPI_SUCCESS; MPID_Group *global_failed, *comm_grp, *new_group_ptr; int attempts = 0; int errflag = 0, tmp_errflag = 0; MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK); MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_SHRINK); /* TODO - Implement this function for intercommunicators */ MPIR_Comm_group_impl(comm_ptr, &comm_grp); do { mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG); /* Ignore the mpi_errno value here as it will definitely communicate * with failed procs */ mpi_errno = MPIR_Group_difference_impl(comm_grp, global_failed, &new_group_ptr); if (mpi_errno) MPIU_ERR_POP(mpi_errno); if (MPID_Group_empty != global_failed) MPIR_Group_release(global_failed); mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr); errflag = mpi_errno || *newcomm_ptr == NULL; mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, &tmp_errflag); MPIR_Group_release(new_group_ptr); if (errflag) MPIU_Object_set_ref(new_group_ptr, 0); } while (errflag && ++attempts < 5); if (errflag && attempts >= 5) goto fn_fail; else mpi_errno = MPI_SUCCESS; fn_exit: MPIR_Group_release(comm_grp); MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_SHRINK); return mpi_errno; fn_fail: if (*newcomm_ptr) MPIU_Object_set_ref(*newcomm_ptr, 0); MPIU_Object_set_ref(global_failed, 0); MPIU_Object_set_ref(new_group_ptr, 0); goto fn_exit; }
int MPIR_Comm_agree(MPIR_Comm *comm_ptr, int *flag) { int mpi_errno = MPI_SUCCESS, mpi_errno_tmp = MPI_SUCCESS; MPIR_Group *comm_grp, *failed_grp, *new_group_ptr, *global_failed; int result, success = 1; MPIR_Errflag_t errflag = MPIR_ERR_NONE; int values[2]; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_AGREE); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_AGREE); MPIR_Comm_group_impl(comm_ptr, &comm_grp); /* Get the locally known (not acknowledged) group of failed procs */ mpi_errno = MPID_Comm_failure_get_acked(comm_ptr, &failed_grp); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* First decide on the group of failed procs. */ mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_AGREE_TAG); if (mpi_errno) errflag = MPIR_ERR_PROC_FAILED; mpi_errno = MPIR_Group_compare_impl(failed_grp, global_failed, &result); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* Create a subgroup without the failed procs */ mpi_errno = MPIR_Group_difference_impl(comm_grp, global_failed, &new_group_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* If that group isn't the same as what we think is failed locally, then * mark it as such. */ if (result == MPI_UNEQUAL || errflag) success = 0; /* Do an allreduce to decide whether or not anyone thinks the group * has changed */ mpi_errno_tmp = MPIR_Allreduce_group(MPI_IN_PLACE, &success, 1, MPI_INT, MPI_MIN, comm_ptr, new_group_ptr, MPIR_AGREE_TAG, &errflag); if (!success || errflag || mpi_errno_tmp) success = 0; values[0] = success; values[1] = *flag; /* Determine both the result of this function (mpi_errno) and the result * of flag that will be returned to the user. */ MPIR_Allreduce_group(MPI_IN_PLACE, values, 2, MPI_INT, MPI_BAND, comm_ptr, new_group_ptr, MPIR_AGREE_TAG, &errflag); /* Ignore the result of the operation this time. Everyone will either * return a failure because of !success earlier or they will return * something useful for flag because of this operation. If there was a new * failure in between the first allreduce and the second one, it's ignored * here. */ if (failed_grp != MPIR_Group_empty) MPIR_Group_release(failed_grp); MPIR_Group_release(new_group_ptr); MPIR_Group_release(comm_grp); if (global_failed != MPIR_Group_empty) MPIR_Group_release(global_failed); success = values[0]; *flag = values[1]; if (!success) { MPIR_ERR_SET(mpi_errno_tmp, MPIX_ERR_PROC_FAILED, "**mpix_comm_agree"); MPIR_ERR_ADD(mpi_errno, mpi_errno_tmp); } fn_exit: MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_AGREE); return mpi_errno; fn_fail: goto fn_exit; }
/* comm shrink impl; assumes that standard error checking has already taken * place in the calling function */ int MPIR_Comm_shrink(MPIR_Comm * comm_ptr, MPIR_Comm ** newcomm_ptr) { int mpi_errno = MPI_SUCCESS; MPIR_Group *global_failed = NULL, *comm_grp = NULL, *new_group_ptr = NULL; int attempts = 0; MPIR_Errflag_t errflag = MPIR_ERR_NONE; MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK); MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_SHRINK); /* TODO - Implement this function for intercommunicators */ MPIR_Comm_group_impl(comm_ptr, &comm_grp); do { errflag = MPIR_ERR_NONE; MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG); /* Ignore the mpi_errno value here as it will definitely communicate * with failed procs */ mpi_errno = MPIR_Group_difference_impl(comm_grp, global_failed, &new_group_ptr); if (mpi_errno) MPIR_ERR_POP(mpi_errno); if (MPIR_Group_empty != global_failed) MPIR_Group_release(global_failed); mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr); if (*newcomm_ptr == NULL) { errflag = MPIR_ERR_PROC_FAILED; } else if (mpi_errno) { errflag = MPIX_ERR_PROC_FAILED == MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER; MPIR_Comm_release(*newcomm_ptr); } mpi_errno = MPII_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, &errflag); MPIR_Group_release(new_group_ptr); if (errflag) { if (*newcomm_ptr != NULL && MPIR_Object_get_ref(*newcomm_ptr) > 0) { MPIR_Object_set_ref(*newcomm_ptr, 1); MPIR_Comm_release(*newcomm_ptr); } if (MPIR_Object_get_ref(new_group_ptr) > 0) { MPIR_Object_set_ref(new_group_ptr, 1); MPIR_Group_release(new_group_ptr); } } } while (errflag && ++attempts < 5); if (errflag && attempts >= 5) goto fn_fail; else mpi_errno = MPI_SUCCESS; fn_exit: MPIR_Group_release(comm_grp); MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_SHRINK); return mpi_errno; fn_fail: if (*newcomm_ptr) MPIR_Object_set_ref(*newcomm_ptr, 0); MPIR_Object_set_ref(global_failed, 0); MPIR_Object_set_ref(new_group_ptr, 0); goto fn_exit; }