コード例 #1
0
/* comm shrink impl; assumes that standard error checking has already taken
 * place in the calling function */
int MPIR_Comm_shrink(MPID_Comm *comm_ptr, MPID_Comm **newcomm_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Group *global_failed, *comm_grp, *new_group_ptr;
    int attempts = 0;
    int errflag = 0, tmp_errflag = 0;

    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_SHRINK);
    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_SHRINK);

    /* TODO - Implement this function for intercommunicators */
    MPIR_Comm_group_impl(comm_ptr, &comm_grp);

    do {
        mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_SHRINK_TAG);
        /* Ignore the mpi_errno value here as it will definitely communicate
         * with failed procs */

        mpi_errno = MPIR_Group_difference_impl(comm_grp, global_failed, &new_group_ptr);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        if (MPID_Group_empty != global_failed) MPIR_Group_release(global_failed);

        mpi_errno = MPIR_Comm_create_group(comm_ptr, new_group_ptr, MPIR_SHRINK_TAG, newcomm_ptr);
        errflag = mpi_errno || *newcomm_ptr == NULL;

        mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &errflag, 1, MPI_INT, MPI_MAX, comm_ptr,
            new_group_ptr, MPIR_SHRINK_TAG, &tmp_errflag);
        MPIR_Group_release(new_group_ptr);

        if (errflag) MPIU_Object_set_ref(new_group_ptr, 0);
    } while (errflag && ++attempts < 5);

    if (errflag && attempts >= 5) goto fn_fail;
    else mpi_errno = MPI_SUCCESS;

  fn_exit:
    MPIR_Group_release(comm_grp);
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_SHRINK);
    return mpi_errno;
  fn_fail:
    if (*newcomm_ptr) MPIU_Object_set_ref(*newcomm_ptr, 0);
    MPIU_Object_set_ref(global_failed, 0);
    MPIU_Object_set_ref(new_group_ptr, 0);
    goto fn_exit;
}
コード例 #2
0
ファイル: comm_agree.c プロジェクト: michael-chuvelev/mpich
int MPIR_Comm_agree(MPIR_Comm *comm_ptr, int *flag)
{
    int mpi_errno = MPI_SUCCESS, mpi_errno_tmp = MPI_SUCCESS;
    MPIR_Group *comm_grp, *failed_grp, *new_group_ptr, *global_failed;
    int result, success = 1;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int values[2];

    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_AGREE);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_AGREE);

    MPIR_Comm_group_impl(comm_ptr, &comm_grp);

    /* Get the locally known (not acknowledged) group of failed procs */
    mpi_errno = MPID_Comm_failure_get_acked(comm_ptr, &failed_grp);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    /* First decide on the group of failed procs. */
    mpi_errno = MPID_Comm_get_all_failed_procs(comm_ptr, &global_failed, MPIR_AGREE_TAG);
    if (mpi_errno) errflag = MPIR_ERR_PROC_FAILED;

    mpi_errno = MPIR_Group_compare_impl(failed_grp, global_failed, &result);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    /* Create a subgroup without the failed procs */
    mpi_errno = MPIR_Group_difference_impl(comm_grp, global_failed, &new_group_ptr);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    /* If that group isn't the same as what we think is failed locally, then
     * mark it as such. */
    if (result == MPI_UNEQUAL || errflag)
        success = 0;

    /* Do an allreduce to decide whether or not anyone thinks the group
     * has changed */
    mpi_errno_tmp = MPIR_Allreduce_group(MPI_IN_PLACE, &success, 1, MPI_INT, MPI_MIN, comm_ptr,
                                         new_group_ptr, MPIR_AGREE_TAG, &errflag);
    if (!success || errflag || mpi_errno_tmp)
        success = 0;

    values[0] = success;
    values[1] = *flag;

    /* Determine both the result of this function (mpi_errno) and the result
     * of flag that will be returned to the user. */
    MPIR_Allreduce_group(MPI_IN_PLACE, values, 2, MPI_INT, MPI_BAND, comm_ptr,
                         new_group_ptr, MPIR_AGREE_TAG, &errflag);
    /* Ignore the result of the operation this time. Everyone will either
     * return a failure because of !success earlier or they will return
     * something useful for flag because of this operation. If there was a new
     * failure in between the first allreduce and the second one, it's ignored
     * here. */

    if (failed_grp != MPIR_Group_empty)
        MPIR_Group_release(failed_grp);
    MPIR_Group_release(new_group_ptr);
    MPIR_Group_release(comm_grp);
    if (global_failed != MPIR_Group_empty)
        MPIR_Group_release(global_failed);

    success = values[0];
    *flag = values[1];

    if (!success) {
        MPIR_ERR_SET(mpi_errno_tmp, MPIX_ERR_PROC_FAILED, "**mpix_comm_agree");
        MPIR_ERR_ADD(mpi_errno, mpi_errno_tmp);
    }

  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_AGREE);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
コード例 #3
0
ファイル: contextid.c プロジェクト: michael-chuvelev/mpich
int MPIR_Get_contextid_sparse_group(MPIR_Comm * comm_ptr, MPIR_Group * group_ptr, int tag,
                                    MPIR_Context_id_t * context_id, int ignore_id)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    struct gcn_state st;
    struct gcn_state *tmp;
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_GET_CONTEXTID);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_GET_CONTEXTID);

    st.first_iter = 1;
    st.comm_ptr = comm_ptr;
    st.tag = tag;
    st.own_mask = 0;
    st.own_eager_mask = 0;
    /* Group-collective and ignore_id should never be combined */
    MPIR_Assert(!(group_ptr != NULL && ignore_id));

    *context_id = 0;

    MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
                                     "Entering; shared state is %d:%d, my ctx id is %d, tag=%d",
                                     mask_in_use, eager_in_use, comm_ptr->context_id, tag));

    while (*context_id == 0) {
        /* We lock only around access to the mask (except in the global locking
         * case).  If another thread is using the mask, we take a mask of zero. */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        if (initialize_context_mask) {
            context_id_init();
        }

        if (eager_nelem < 0) {
            /* Ensure that at least one word of deadlock-free context IDs is
             * always set aside for the base protocol */
            MPIR_Assert(MPIR_CVAR_CTXID_EAGER_SIZE >= 0 &&
                        MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
            eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
        }

        if (ignore_id) {
            /* We are not participating in the resulting communicator, so our
             * context ID space doesn't matter.  Set the mask to "all available". */
            memset(st.local_mask, 0xff, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_mask = 0;
            /* don't need to touch mask_in_use/lowest_context_id b/c our thread
             * doesn't ever need to "win" the mask */
        }

        /* Deadlock avoidance: Only participate in context id loop when all
         * processes have called this routine.  On the first iteration, use the
         * "eager" allocation protocol.
         */
        else if (st.first_iter) {
            memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
            st.own_eager_mask = 0;
            /* Attempt to reserve the eager mask segment */
            if (!eager_in_use && eager_nelem > 0) {
                int i;
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = context_mask[i];

                eager_in_use = 1;
                st.own_eager_mask = 1;
            }
        }

        else {
            MPIR_Assert(next_gcn != NULL);
            /*If we are here, at least one element must be in the list, at least myself */

            /* only the first element in the list can own the mask. However, maybe the mask is used
             * by another thread, which added another allcoation to the list bevore. So we have to check,
             * if the mask is used and mark, if we own it */
            if (mask_in_use || &st != next_gcn) {
                memset(st.local_mask, 0, MPIR_MAX_CONTEXT_MASK * sizeof(int));
                st.own_mask = 0;
                MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE, (MPL_DBG_FDEST,
                                                 "Mask is in use, my context_id is %d, owner context id is %d",
                                                 st.comm_ptr->context_id,
                                                 next_gcn->comm_ptr->context_id));
            }
            else {
                int i;
                /* Copy safe mask segment to local_mask */
                for (i = 0; i < eager_nelem; i++)
                    st.local_mask[i] = 0;
                for (i = eager_nelem; i < MPIR_MAX_CONTEXT_MASK; i++)
                    st.local_mask[i] = context_mask[i];

                mask_in_use = 1;
                st.own_mask = 1;
                MPL_DBG_MSG(MPIR_DBG_COMM, VERBOSE, "Copied local_mask");
            }
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Note: MPIR_MAX_CONTEXT_MASK elements of local_mask are used by the
         * context ID allocation algorithm.  The additional element is ignored
         * by the context ID mask access routines and is used as a flag for
         * detecting context ID exhaustion (explained below). */
        if (st.own_mask || ignore_id)
            st.local_mask[ALL_OWN_MASK_FLAG] = 1;
        else
            st.local_mask[ALL_OWN_MASK_FLAG] = 0;

        /* Now, try to get a context id */
        MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM);
        /* In the global and brief-global cases, note that this routine will
         * release that global lock when it needs to wait.  That will allow
         * other processes to enter the global or brief global critical section.
         */
        if (group_ptr != NULL) {
            int coll_tag = tag | MPIR_Process.tagged_coll_mask; /* Shift tag into the tagged coll space */
            mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, st.local_mask, MPIR_MAX_CONTEXT_MASK + 1,
                                             MPI_INT, MPI_BAND, comm_ptr, group_ptr, coll_tag,
                                             &errflag);
        }
        else {
            mpi_errno = MPID_Allreduce(MPI_IN_PLACE, st.local_mask, MPIR_MAX_CONTEXT_MASK + 1,
                                            MPI_INT, MPI_BAND, comm_ptr, &errflag);
        }
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* MT FIXME 2/3 cases don't seem to need the CONTEXTID CS, check and
         * narrow this region */
        MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        if (ignore_id) {
            /* we don't care what the value was, but make sure that everyone
             * who did care agreed on a value */
            *context_id = locate_context_bit(st.local_mask);
            /* used later in out-of-context ids check and outer while loop condition */
        }
        else if (st.own_eager_mask) {
            /* There is a chance that we've found a context id */
            /* Find_and_allocate_context_id updates the context_mask if it finds a match */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);

            st.own_eager_mask = 0;
            eager_in_use = 0;
            if (*context_id <= 0) {
                /* else we did not find a context id. Give up the mask in case
                 * there is another thread (with a lower input context id)
                 * waiting for it.  We need to ensure that any other threads
                 * have the opportunity to run, hence yielding */
                /* FIXME: Do we need to do an GLOBAL yield here?
                 * When we do a collective operation, we anyway yield
                 * for other others */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else if (st.own_mask) {
            /* There is a chance that we've found a context id */
            /* Find_and_allocate_context_id updates the context_mask if it finds a match */
            *context_id = find_and_allocate_context_id(st.local_mask);
            MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "Context id is now %hd", *context_id);

            mask_in_use = 0;

            if (*context_id > 0) {
                /* If we found a new context id, we have to remove the element from the list, so the
                 * next allocation can own the mask */
                if (next_gcn == &st) {
                    next_gcn = st.next;
                }
                else {
                    for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next);    /* avoid compiler warnings */
                    tmp->next = st.next;
                }
            }
            else {
                /* else we did not find a context id. Give up the mask in case
                 * there is another thread in the gcn_next_list
                 * waiting for it.  We need to ensure that any other threads
                 * have the opportunity to run, hence yielding */
                /* FIXME: Do we need to do an GLOBAL yield here?
                 * When we do a collective operation, we anyway yield
                 * for other others */
                MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
                MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
        else {
            /* As above, force this thread to yield */
            /* FIXME: Do we need to do an GLOBAL yield here?  When we
             * do a collective operation, we anyway yield for other
             * others */
            MPID_THREAD_CS_YIELD(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
            MPID_THREAD_CS_YIELD(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
        }
        MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);

        /* Test for context ID exhaustion: All threads that will participate in
         * the new communicator owned the mask and could not allocate a context
         * ID.  This indicates that either some process has no context IDs
         * available, or that some are available, but the allocation cannot
         * succeed because there is no common context ID. */
        if (*context_id == 0 && st.local_mask[ALL_OWN_MASK_FLAG] == 1) {
            /* --BEGIN ERROR HANDLING-- */
            int nfree = 0;
            int ntotal = 0;
            int minfree;

            if (st.own_mask) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                mask_in_use = 0;
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }

            context_mask_stats(&nfree, &ntotal);
            if (ignore_id)
                minfree = INT_MAX;
            else
                minfree = nfree;

            if (group_ptr != NULL) {
                int coll_tag = tag | MPIR_Process.tagged_coll_mask;     /* Shift tag into the tagged coll space */
                mpi_errno = MPIR_Allreduce_group(MPI_IN_PLACE, &minfree, 1, MPI_INT, MPI_MIN,
                                                 comm_ptr, group_ptr, coll_tag, &errflag);
            }
            else {
                mpi_errno = MPID_Allreduce(MPI_IN_PLACE, &minfree, 1, MPI_INT,
                                                MPI_MIN, comm_ptr, &errflag);
            }

            if (minfree > 0) {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycommfrag", "**toomanycommfrag %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            else {
                MPIR_ERR_SETANDJUMP3(mpi_errno, MPI_ERR_OTHER,
                                     "**toomanycomm", "**toomanycomm %d %d %d",
                                     nfree, ntotal, ignore_id);
            }
            /* --END ERROR HANDLING-- */
        }
        if (st.first_iter == 1) {
            st.first_iter = 0;
            /* to avoid deadlocks, the element is not added to the list bevore the first iteration */
            if (!ignore_id && *context_id == 0) {
                MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
                add_gcn_to_list(&st);
                MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
            }
        }
    }

  fn_exit:
    if (ignore_id)
        *context_id = MPIR_INVALID_CONTEXT_ID;
    MPL_DBG_MSG_S(MPIR_DBG_COMM, VERBOSE, "Context mask = %s", context_mask_to_str());
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_GET_CONTEXTID);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    /* Release the masks */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);
    if (st.own_mask) {
        mask_in_use = 0;
    }
    /*If in list, remove it */
    if (!st.first_iter && !ignore_id) {
        if (next_gcn == &st) {
            next_gcn = st.next;
        }
        else {
            for (tmp = next_gcn; tmp->next != &st; tmp = tmp->next);
            tmp->next = st.next;
        }
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_CTX_MUTEX);


    goto fn_exit;
    /* --END ERROR HANDLING-- */
}