int MPID_Comm_failure_get_acked(MPID_Comm *comm_ptr, MPID_Group **group_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Group *failed_group, *comm_group;
    MPIDI_STATE_DECL(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);

    /* Get the group of all failed processes */
    MPIDI_CH3U_Check_for_failed_procs();
    MPIDI_CH3U_Get_failed_group(comm_ptr->dev.last_ack_rank, &failed_group);
    if (failed_group == MPID_Group_empty) {
        *group_ptr = MPID_Group_empty;
        goto fn_exit;
    }

    MPIR_Comm_group_impl(comm_ptr, &comm_group);

    /* Get the intersection of all falied processes in this communicator */
    MPIR_Group_intersection_impl(failed_group, comm_group, group_ptr);

    MPIR_Group_release(comm_group);
    MPIR_Group_release(failed_group);

fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_COMM_FAILURE_GET_ACKED);
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
Ejemplo n.º 2
0
int MPIDI_CH3U_Check_for_failed_procs(void)
{
    int mpi_errno = MPI_SUCCESS;
    int pmi_errno;
    int len;
    char *kvsname;
    MPIR_Group *prev_failed_group, *new_failed_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    /* FIXME: Currently this only handles failed processes in
       comm_world.  We need to fix hydra to include the pgid along
       with the rank, then we need to create the failed group from
       something bigger than comm_world. */
    mpi_errno = MPIDI_PG_GetConnKVSname(&kvsname);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
#ifdef USE_PMI2_API
    {
        int vallen = 0;
        pmi_errno = PMI2_KVS_Get(kvsname, PMI2_ID_NULL, "PMI_dead_processes", MPIDI_failed_procs_string, PMI2_MAX_VALLEN, &vallen);
        MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
    }
#else
    pmi_errno = PMI_KVS_Get_value_length_max(&len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
    pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", MPIDI_failed_procs_string, len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
#endif

    if (*MPIDI_failed_procs_string == '\0') {
        /* there are no failed processes */
        MPIDI_Failed_procs_group = MPIR_Group_empty;
        goto fn_exit;
    }

    MPL_DBG_MSG_S(MPIDI_CH3_DBG_OTHER, TYPICAL, "Received proc fail notification: %s", MPIDI_failed_procs_string);

    /* save reference to previous group so we can identify new failures */
    prev_failed_group = MPIDI_Failed_procs_group;

    /* Parse the list of failed processes */
    MPIDI_CH3U_Get_failed_group(-2, &MPIDI_Failed_procs_group);

    /* get group of newly failed processes */
    mpi_errno = MPIR_Group_difference_impl(MPIDI_Failed_procs_group, prev_failed_group, &new_failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (new_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIDI_CH3I_Comm_handle_failed_procs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = terminate_failed_VCs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_release(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

    /* free prev group */
    if (prev_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIR_Group_release(prev_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
    return mpi_errno;

 fn_oom: /* out-of-memory handler for utarray operations */
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
 fn_fail:
    goto fn_exit;
}
int MPID_Comm_failed_bitarray(MPID_Comm *comm_ptr, uint32_t **bitarray, int acked)
{
    int mpi_errno = MPI_SUCCESS;
    int size, i;
    uint32_t bit;
    int *failed_procs, *group_procs;
    MPID_Group *failed_group, *comm_group;
    MPIU_CHKLMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_COMM_FAILED_BITARRAY);

    MPIDI_FUNC_ENTER(MPID_STATE_COMM_FAILED_BITARRAY);

    /* TODO - Fix this for intercommunicators */
    size = comm_ptr->local_size;

    /* We can fit sizeof(uint32_t) * 8 ranks in one uint64_t so divide the
     * size by that */
    /* This buffer will be handed back to the calling function so we use a
     * "real" malloc here and expect the caller to free the buffer later. The
     * other buffers in this function are temporary and will be automatically
     * cleaned up at the end of the function. */
    *bitarray = (uint32_t *) MPIU_Malloc(sizeof(uint32_t) * (size / (sizeof(uint32_t) * 8)+1));
    if (!(*bitarray)) {
        fprintf(stderr, "Could not allocate space for bitarray\n");
        PMPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (i = 0; i <= size/(sizeof(uint32_t)*8); i++) *bitarray[i] = 0;

    mpi_errno = MPIDI_CH3U_Check_for_failed_procs();
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    if (acked)
        MPIDI_CH3U_Get_failed_group(comm_ptr->dev.last_ack_rank, &failed_group);
    else
        MPIDI_CH3U_Get_failed_group(-2, &failed_group);

    if (failed_group == MPID_Group_empty) goto fn_exit;


    MPIU_CHKLMEM_MALLOC(group_procs, int *, sizeof(int)*failed_group->size, mpi_errno, "group_procs");
    for (i = 0; i < failed_group->size; i++) group_procs[i] = i;
    MPIU_CHKLMEM_MALLOC(failed_procs, int *, sizeof(int)*failed_group->size, mpi_errno, "failed_procs");

    MPIR_Comm_group_impl(comm_ptr, &comm_group);

    MPIR_Group_translate_ranks_impl(failed_group, failed_group->size, group_procs, comm_group, failed_procs);

    /* The bits will actually be ordered in decending order rather than
     * ascending. This is purely for readability since it makes no practical
     * difference. So if the bits look like this:
     *
     * 10001100 01001000 00000000 00000001
     *
     * Then processes 1, 5, 6, 9, 12, and 32 have failed. */
    for (i = 0; i < failed_group->size; i++) {
        bit = 0x80000000;
        bit >>= failed_procs[i] % (sizeof(uint32_t) * 8);

        *bitarray[failed_procs[i] / (sizeof(uint32_t) * 8)] |= bit;
    }

    MPIR_Group_free_impl(comm_group);

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_FUNC_EXIT(MPID_STATE_COMM_FAILED_BITARRAY);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}