Example #1
0
static int getConnInfoKVS( int rank, char *buf, int bufsize, MPIDI_PG_t *pg )
{
#ifdef USE_PMI2_API
    char key[MPIDI_MAX_KVS_KEY_LEN];
    int  mpi_errno = MPI_SUCCESS, rc;
    int vallen;

    rc = MPL_snprintf(key, MPIDI_MAX_KVS_KEY_LEN, "P%d-businesscard", rank );
    if (rc < 0 || rc > MPIDI_MAX_KVS_KEY_LEN) {
	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
    }

    mpi_errno = PMI2_KVS_Get(pg->connData, PMI2_ID_NULL, key, buf, bufsize, &vallen);
    if (mpi_errno) {
	MPIDI_PG_CheckForSingleton();
	mpi_errno = PMI2_KVS_Get(pg->connData, PMI2_ID_NULL, key, buf, bufsize, &vallen);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
#else
    char key[MPIDI_MAX_KVS_KEY_LEN];
    int  mpi_errno = MPI_SUCCESS, rc, pmi_errno;

    rc = MPL_snprintf(key, MPIDI_MAX_KVS_KEY_LEN, "P%d-businesscard", rank );
    if (rc < 0 || rc > MPIDI_MAX_KVS_KEY_LEN) {
	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**nomem");
    }

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
    pmi_errno = PMI_KVS_Get(pg->connData, key, buf, bufsize );
    if (pmi_errno) {
	MPIDI_PG_CheckForSingleton();
	pmi_errno = PMI_KVS_Get(pg->connData, key, buf, bufsize );
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
    if (pmi_errno) {
	MPIR_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**pmi_kvs_get");
    }

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
#endif
}
Example #2
0
static int test_item6(void)
{
    int rc = 0;
    char val[PMI2_MAX_VALLEN];
    int len;
    const char *tkey = __FUNCTION__;
    const char *tval = __FILE__;

    if (PMI2_SUCCESS != (rc = PMI2_KVS_Put(tkey, tval))) {
        log_fatal("PMI2_KVS_Put %d\n", rc);
        return rc;
    }

    if (PMI2_SUCCESS != (rc = PMI2_KVS_Get(NULL, PMI2_ID_NULL, tkey, val, sizeof(val), &len))) {
        log_fatal("PMI2_KVS_Get %d\n", rc);
        return rc;
    }

    log_info("tkey=%s tval=%s val=%s len=%d\n", tkey, tval, val, len);

    log_assert((int)strlen(tval) == len, "value does not meet expectation");
    log_assert(!strcmp(tval, val), "value does not meet expectation");

    return rc;
}
Example #3
0
void pmi_get(int pe, char *key, void *value, size_t valuelen) {
    int len;

    snprintf(kvs_key, max_key_len, "ofi-%lu-%s", (long unsigned) pe, key);
    int res = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL,
                           kvs_key, kvs_value, max_val_len, &len);
    assert(res == PMI2_SUCCESS);
    res = decode(kvs_value, value, valuelen);
    assert(res == 0);
}
static int kvs_get(const char *key, char *value, int valuelen)
{
#if WANT_CRAY_PMI2_EXT
    int len;

    return PMI2_KVS_Get(pmi_kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
#else
    return PMI_KVS_Get(pmi_kvs_name, key, value, valuelen);
#endif
}
Example #5
0
static int kvs_get(const char key[], char value [], int maxvalue)
{
    int rc;
    int len;
    rc = PMI2_KVS_Get(pmix_kvs_name, PMI2_ID_NULL, key, value, maxvalue, &len);
    if( PMI2_SUCCESS != rc || len < 0){
        OPAL_PMI_ERROR(rc, "PMI2_KVS_Get");
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
Example #6
0
void do_kvs_get(void *value, size_t sz) {
    int rc;
#if USE_PMI2_API
    int len;
    rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, kvs_key, kvs_value, max_val_len, &len);
    gasneti_assert(PMI2_SUCCESS == rc);
#else
    rc = PMI_KVS_Get(kvs_name, kvs_key, kvs_value, max_val_len);
    gasneti_assert(PMI_SUCCESS == rc);
#endif
    do_decode(value, sz);
}
Example #7
0
int MPIDI_check_for_failed_procs(void)
{
    int mpi_errno = MPI_SUCCESS;
    int pmi_errno;
    int len;
    char *kvsname = MPIDI_global.jobid;
    char *failed_procs_string = NULL;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CHECK_FOR_FAILED_PROCS);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CHECK_FOR_FAILED_PROCS);

    /* FIXME: Currently this only handles failed processes in
     * comm_world.  We need to fix hydra to include the pgid along
     * with the rank, then we need to create the failed group from
     * something bigger than comm_world. */
#ifdef USE_PMIX_API
    MPIR_Assert(0);
#elif defined(USE_PMI2_API)
    {
        int vallen = 0;
        len = PMI2_MAX_VALLEN;
        failed_procs_string = MPL_malloc(len, MPL_MEM_OTHER);
        MPIR_Assert(failed_procs_string);
        pmi_errno =
            PMI2_KVS_Get(kvsname, PMI2_ID_NULL, "PMI_dead_processes", failed_procs_string,
                         len, &vallen);
        MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
        MPL_free(failed_procs_string);
    }
#else
    pmi_errno = PMI_KVS_Get_value_length_max(&len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
    failed_procs_string = MPL_malloc(len, MPL_MEM_OTHER);
    MPIR_Assert(failed_procs_string);
    pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", failed_procs_string, len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
    MPL_free(failed_procs_string);
#endif

    MPL_DBG_MSG_FMT(MPIDI_CH4_DBG_GENERAL, VERBOSE,
                    (MPL_DBG_FDEST, "Received proc fail notification: %s", failed_procs_string));

    /* FIXME: handle ULFM failed groups here */

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CHECK_FOR_FAILED_PROCS);
    return mpi_errno;
  fn_fail:
    MPL_free(failed_procs_string);
    goto fn_exit;
}
Example #8
0
static int kvs_get(const char key[], char value [], int maxvalue)
{
    int rc;
    int len;
    rc = PMI2_KVS_Get(pmix_kvs_name, PMI2_ID_NULL, key, value, maxvalue, &len);
    /*
     * turns out the KVS can be called for keys that haven't yet
     * been inserted, so suppress warning message if this is the
     * case
     */
    if (PMI2_SUCCESS != rc) {
        return OPAL_ERROR;
    }
    return OPAL_SUCCESS;
}
Example #9
0
File: pmi2.c Project: bosilca/ompi
double pmi_get_double(int rank, char *key)
{
    int len, rc;
    size_t tmp_size;
    char *tmp = calloc(PMI2_MAX_VALLEN, sizeof(char));
    double v;

    if( (rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, key, tmp, PMI2_MAX_VALLEN, &len) ) ){
        fprintf(stderr,"PMI2_Info_GetNodeAttr: error rc = %d\n", rc);
        abort();
    }
    sscanf(tmp, "%lf", &v);
    free(tmp);
    return v;
}
Example #10
0
int
shmem_runtime_get(int pe, char *key, void *value, size_t valuelen)
{
    int len;

    snprintf(kvs_key, max_key_len, "shmem-%lu-%s", (long unsigned) pe, key);
    if (PMI2_SUCCESS != PMI2_KVS_Get(kvs_name, PMI2_ID_NULL,
                                     kvs_key, kvs_value, max_val_len, &len)) {
        return 1;
    }
    if (0 != decode(kvs_value, value, valuelen)) {
        return 2;
    }

    return 0;
}
int MPIDI_CH3_GetParentPort(char ** parent_port)
{
    int mpi_errno = MPI_SUCCESS;
    int pmi_errno;
    char val[MPIDI_MAX_KVS_VALUE_LEN];

    if (parent_port_name == NULL)
    {
	char *kvsname = NULL;
	/* We can always use PMI_KVS_Get on our own process group */
	MPIDI_PG_GetConnKVSname( &kvsname );
#ifdef USE_PMI2_API
        {
            int vallen = 0;
            MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
            pmi_errno = PMI2_KVS_Get(kvsname, PMI2_ID_NULL, PARENT_PORT_KVSKEY, val, sizeof(val), &vallen);
            MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
            if (pmi_errno)
                MPIR_ERR_SETANDJUMP1(mpi_errno, MPI_ERR_OTHER, "**pmi_kvsget", "**pmi_kvsget %s", PARENT_PORT_KVSKEY);
        }
#else
	MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
	pmi_errno = PMI_KVS_Get( kvsname, PARENT_PORT_KVSKEY, val, sizeof(val));
	MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_POBJ_PMI_MUTEX);
	if (pmi_errno) {
            mpi_errno = MPIR_Err_create_code(MPI_SUCCESS, MPIR_ERR_FATAL, FCNAME, __LINE__, MPI_ERR_OTHER, "**pmi_kvsget", "**pmi_kvsget %d", pmi_errno);
            goto fn_exit;
	}
#endif
	parent_port_name = MPL_strdup(val);
	if (parent_port_name == NULL) {
	    MPIR_ERR_POP(mpi_errno); /* FIXME DARIUS */
	}
    }

    *parent_port = parent_port_name;

 fn_exit:
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Example #12
0
File: pmi2.c Project: bosilca/ompi
void pmi_get_key_rem(int rank, char *key_name, int **key_val, int *key_size)
{
    int len, rc;
    size_t tmp_size;

    char *tmp = calloc(PMI2_MAX_VALLEN, sizeof(char));
    if( (rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, key_name, tmp, PMI2_MAX_VALLEN, &len) ) ){
        fprintf(stderr,"PMI2_Info_GetNodeAttr: error rc = %d\n", rc);
        abort();
    }

    *key_val = (int*)pmi_decode(tmp, &tmp_size);
    *key_size = tmp_size / sizeof(int);

    if( NULL == *key_val ){
        fprintf(stderr,"pmi_decode: cannot decode key %s\n", key_name);
        abort();
    }
    free(tmp);
}
Example #13
0
static int test_item8(void)
{
    int rc = 0;
    int len;
    char tkey[PMI2_MAX_VALLEN];
    char tval[PMI2_MAX_VALLEN];
    char val[PMI2_MAX_VALLEN];
    int i = 0;

    for (i = 0; i < size; i++) {
        sprintf(tkey, "KEY-%d", i);
        sprintf(tval, "VALUE-%d", i);
        if (i == rank) {
            if (PMI2_SUCCESS != (rc = PMI2_KVS_Put(tkey, tval))) {
                log_fatal("PMI2_KVS_Put [%s=%s] %d\n", tkey, tval, rc);
                return rc;
            }
        }

        if (PMI2_SUCCESS != (rc = PMI2_KVS_Fence())) {
            log_fatal("PMI2_KVS_Fence %d\n", rc);
            return rc;
        }

        if (PMI2_SUCCESS != (rc = PMI2_KVS_Get(jobid, PMI2_ID_NULL, tkey, val, sizeof(val), &len))) {
            log_fatal("PMI2_KVS_Get [%s=?] %d\n", tkey, rc);
            return rc;
        }

        log_info("tkey=%s tval=%s val=%s len=%d\n", tkey, tval, val, len);

        log_assert((int)strlen(tval) == len, "value does not meet expectation");
        log_assert(!strcmp(tval, val), "value does not meet expectation");
    }

    return rc;
}
Example #14
0
int mca_common_pmi_get(const char *kvs_name, const char *key,
                       char *value, int valuelen)
{
    int rc;
#if WANT_PMI2_SUPPORT
    if( mca_common_pmi_version == 2 ){
        int len;
        rc = PMI2_KVS_Get(kvs_name, PMI2_ID_NULL, key, value, valuelen, &len);
        if( PMI2_SUCCESS != rc ){
            // OPAL_PMI2_ERROR(rc, "PMI_KVS_Put");
            return OPAL_ERROR;
        }
    }
    else
#endif
    {
        rc = PMI_KVS_Get(kvs_name, key, value, valuelen);
        if( PMI_SUCCESS != rc ){
            OPAL_PMI_ERROR(rc, "PMI_KVS_Put");
            return OPAL_ERROR;
        }
    }
    return OPAL_SUCCESS;
}
Example #15
0
static int test_item9(void)
{
    int rc = 0;
    int i, j, r;
    char symb, symb_start = 'a';
    int fence_cnt;
    int fence_num = random_value(2, 10);
    int keys_per_fence = random_value(10, 100);
    int val_size = random_value(10, PMI2_MAX_VALLEN / 10);
    int keys_total = 0;

    fence_cnt = 0;
    while (fence_cnt < fence_num) {
        log_info("fence_cnt=%d of fence_num=%d keys_per_fence=%d keys_total=%d val_size=%d\n",
                fence_cnt, fence_num, keys_per_fence, keys_total, val_size);
        symb = symb_start;
        for (i = 0; i < keys_per_fence; i++) {
            char key[PMI2_MAX_KEYLEN];
            char val[PMI2_MAX_VALLEN] = "";
            sprintf(key, "RANK%d-key-%d", rank, i + keys_total);
            for (j = 0; j < val_size; j++) {
                val[j] = symb;
            }
            symb++;
            if (symb > 'z') {
                symb = 'a';
            }
            if (PMI2_SUCCESS != (rc = PMI2_KVS_Put(key, val))) {
                log_fatal("PMI2_KVS_Put [%s=%s] %d\n", key, val, rc);
                return rc;
            }
            log_info("PMI2_KVS_Put [rank=%d %s] %d\n", rank, key, rc);
        }
        symb_start = symb;
        keys_total += keys_per_fence;

        if (PMI2_SUCCESS != (rc = PMI2_KVS_Fence())) {
            log_fatal("PMI2_KVS_Fence %d\n", rc);
            return rc;
        }

        for (r = 0; r < size; r++) {
            int len;
            symb = 'a';
            for (i = 0; i < keys_total; i++) {
                char key[PMI2_MAX_KEYLEN];
                char val[PMI2_MAX_VALLEN] = "";
                sprintf(key, "RANK%d-key-%d", r, i);

                if (PMI2_SUCCESS != (rc = PMI2_KVS_Get(jobid, r, key, val, sizeof(val), &len))) {
                    log_fatal("PMI2_KVS_Get [%s=?] %d\n", key, rc);
                    return rc;
                }

                log_info("PMI2_KVS_Get [rank=%d %s] %d\n", rank, key, rc);

                if (len != val_size) {
                    log_fatal("%d: failure on rank %d, key #%d: len mismatch:"
                            " %d instead of %d\n", rank, r, i, len, val_size);
                }

                for (j = 0; j < val_size; j++) {
                    if (val[j] != symb) {
                        log_fatal("%d: failure on rank %d, key #%d: value mismatch"
                                " at symb %d: \'%c\' instead of \'%c\'\n", rank,
                                r, i, j, val[j], symb);
                    }
                }
                symb++;
                if (symb > 'z') {
                    symb = 'a';
                }
            }
        }
        fence_cnt++;
    }

    return rc;
}
Example #16
0
int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len, int same_len,
                          int roots_only, void **bc_table, size_t ** bc_indices)
{
    int rc, mpi_errno = MPI_SUCCESS;
    int start, end, i;
    int out_len, val_len, rem;
    char *key = NULL, *val = NULL, *val_p;
    int local_rank, local_leader;
    size_t my_bc_len = bc_len;

    MPIR_NODEMAP_get_local_info(rank, size, nodemap, &local_size, &local_rank, &local_leader);

    /* if business cards can be different length, use the max value length */
    if (!same_len)
        bc_len = PMI2_MAX_VALLEN;
    mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    mpi_errno =
        MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank,
                             MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if (size == 1) {
        memcpy(segment, bc, my_bc_len);
        goto single;
    }

    val = MPL_malloc(PMI2_MAX_VALLEN, MPL_MEM_ADDRESS);
    memset(val, 0, PMI2_MAX_VALLEN);
    val_p = val;
    rem = PMI2_MAX_VALLEN;
    rc = MPL_str_add_binary_arg(&val_p, &rem, "mpi", (char *) bc, my_bc_len);
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**buscard");
    MPIR_Assert(rem >= 0);

    key = MPL_malloc(PMI2_MAX_KEYLEN, MPL_MEM_ADDRESS);
    MPIR_Assert(key);
    if (!roots_only || rank == local_leader) {
        sprintf(key, "bc-%d", rank);
        rc = PMI2_KVS_Put(key, val);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmi_kvsput");
    }
    rc = PMI2_KVS_Fence();
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmi_kvsfence");

    if (!roots_only) {
        start = local_rank * (size / local_size);
        end = start + (size / local_size);
        if (local_rank == local_size - 1)
            end += size % local_size;
        for (i = start; i < end; i++) {
            sprintf(key, "bc-%d", i);
            rc = PMI2_KVS_Get(NULL, -1, key, val, PMI2_MAX_VALLEN, &val_len);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmi_kvsget");
            rc = MPL_str_get_binary_arg(val, "mpi", &segment[i * bc_len], bc_len, &out_len);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**argstr_missinghost");
        }
    } else {
        int num_nodes, *node_roots;
        MPIR_NODEMAP_get_node_roots(nodemap, size, &node_roots, &num_nodes);

        start = local_rank * (num_nodes / local_size);
        end = start + (num_nodes / local_size);
        if (local_rank == local_size - 1)
            end += num_nodes % local_size;
        for (i = start; i < end; i++) {
            sprintf(key, "bc-%d", node_roots[i]);
            rc = PMI2_KVS_Get(NULL, -1, key, val, PMI2_MAX_VALLEN, &val_len);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmi_kvsget");
            rc = MPL_str_get_binary_arg(val, "mpi", &segment[i * bc_len], bc_len, &out_len);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**argstr_missinghost");
        }
        MPL_free(node_roots);
    }
    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

  single:
    if (!same_len) {
        indices = MPL_malloc(size * sizeof(size_t), MPL_MEM_ADDRESS);
        for (i = 0; i < size; i++)
            indices[i] = bc_len * i;
        *bc_indices = indices;
    }

  fn_exit:
    MPL_free(key);
    MPL_free(val);
    *bc_table = segment;

    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Example #17
0
int
main(int argc, char **argv)
{
	int rank;
	int size;
	int appnum;
	int spawned;
	int flag;
	int len;
	int i;
	struct timeval tv;
	struct timeval tv2;
	char jobid[128];
	char key[128];
	char val[128];
	char buf[128];

	{
		int x = 1;
		while (x == 0) {
			sleep(2);
		}
	}

	gettimeofday(&tv, NULL);
	srand(tv.tv_sec);

	PMI2_Init(&spawned, &size, &rank, &appnum);

	PMI2_Job_GetId(jobid, sizeof(buf));

	memset(val, 0, sizeof(val));
	PMI2_Info_GetJobAttr("mpi_reserved_ports",
			     val,
			     PMI2_MAX_ATTRVALUE,
			     &flag);

	sprintf(key, "mpi_reserved_ports");
	PMI2_KVS_Put(key, val);

	memset(val, 0, sizeof(val));
	sprintf(buf, "PMI_netinfo_of_task");
	PMI2_Info_GetJobAttr(buf,
			     val,
			     PMI2_MAX_ATTRVALUE,
			     &flag);
	sprintf(key, buf);
	PMI2_KVS_Put(key, val);

	memset(val, 0, sizeof(val));
	sprintf(key, "david@%d", rank);
	sprintf(val, "%s", mrand(97, 122));
	PMI2_KVS_Put(key, val);

	PMI2_KVS_Fence();

	for (i = 0; i < size; i++) {

		memset(val, 0, sizeof(val));
		sprintf(key, "PMI_netinfo_of_task");
		PMI2_KVS_Get(jobid,
			     PMI2_ID_NULL,
			     key,
			     val,
			     sizeof(val),
			     &len);
		printf("rank: %d key:%s val:%s\n", rank, key, val);

		memset(val, 0, sizeof(val));
		sprintf(key, "david@%d", rank);
		PMI2_KVS_Get(jobid,
			     PMI2_ID_NULL,
			     key,
			     val,
			     sizeof(val),
			     &len);
		printf("rank: %d key:%s val:%s\n", rank, key, val);

		memset(val, 0, sizeof(val));
		sprintf(key, "mpi_reserved_ports");
		PMI2_KVS_Get(jobid,
			     PMI2_ID_NULL,
			     key,
			     val,
			     sizeof(val),
			     &len);
		printf("rank: %d key:%s val:%s\n", rank, key, val);
	}

	PMI2_Finalize();

	gettimeofday(&tv2, NULL);
	printf("%f\n",
	       ((tv2.tv_sec - tv.tv_sec) * 1000.0
		+ (tv2.tv_usec - tv.tv_usec) / 1000.0));

	return 0;
}
Example #18
0
int MPIDI_CH3U_Check_for_failed_procs(void)
{
    int mpi_errno = MPI_SUCCESS;
    int pmi_errno;
    int len;
    char *kvsname;
    MPIR_Group *prev_failed_group, *new_failed_group;
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);

    /* FIXME: Currently this only handles failed processes in
       comm_world.  We need to fix hydra to include the pgid along
       with the rank, then we need to create the failed group from
       something bigger than comm_world. */
    mpi_errno = MPIDI_PG_GetConnKVSname(&kvsname);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
#ifdef USE_PMI2_API
    {
        int vallen = 0;
        pmi_errno = PMI2_KVS_Get(kvsname, PMI2_ID_NULL, "PMI_dead_processes", MPIDI_failed_procs_string, PMI2_MAX_VALLEN, &vallen);
        MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
    }
#else
    pmi_errno = PMI_KVS_Get_value_length_max(&len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get_value_length_max");
    pmi_errno = PMI_KVS_Get(kvsname, "PMI_dead_processes", MPIDI_failed_procs_string, len);
    MPIR_ERR_CHKANDJUMP(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**pmi_kvs_get");
#endif

    if (*MPIDI_failed_procs_string == '\0') {
        /* there are no failed processes */
        MPIDI_Failed_procs_group = MPIR_Group_empty;
        goto fn_exit;
    }

    MPL_DBG_MSG_S(MPIDI_CH3_DBG_OTHER, TYPICAL, "Received proc fail notification: %s", MPIDI_failed_procs_string);

    /* save reference to previous group so we can identify new failures */
    prev_failed_group = MPIDI_Failed_procs_group;

    /* Parse the list of failed processes */
    MPIDI_CH3U_Get_failed_group(-2, &MPIDI_Failed_procs_group);

    /* get group of newly failed processes */
    mpi_errno = MPIR_Group_difference_impl(MPIDI_Failed_procs_group, prev_failed_group, &new_failed_group);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    if (new_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIDI_CH3I_Comm_handle_failed_procs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = terminate_failed_VCs(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_release(new_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

    /* free prev group */
    if (prev_failed_group != MPIR_Group_empty) {
        mpi_errno = MPIR_Group_release(prev_failed_group);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3U_CHECK_FOR_FAILED_PROCS);
    return mpi_errno;

 fn_oom: /* out-of-memory handler for utarray operations */
    MPIR_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**nomem", "**nomem %s", "utarray");
 fn_fail:
    goto fn_exit;
}