Example #1
int MPID_nem_ptl_poll_init(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPIU_CHKPMEM_DECL(NUM_OVERFLOW_ME);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_POLL_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_POLL_INIT);

    /* create overflow buffers */
    for (i = 0; i < NUM_OVERFLOW_ME; ++i) {
        MPIU_CHKPMEM_MALLOC(overflow_buf[i], void *, OVERFLOW_LENGTH, mpi_errno, "overflow buffer");
        mpi_errno = append_overflow(i);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
    
 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_POLL_INIT);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
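All of the examples in this section follow the same allocation discipline: MPIU_CHKPMEM_DECL(n) declares a registry of up to n pending allocations, MPIU_CHKPMEM_MALLOC allocates and registers one, MPIU_CHKPMEM_COMMIT keeps everything registered once the function has succeeded, and MPIU_CHKPMEM_REAP frees everything still pending on the fn_fail path. Below is a minimal plain-C sketch of that idiom, not the real macro implementation; the names (chkpmem_*, MAX_PENDING, init_buffers) are hypothetical.

#include <stdlib.h>

#define MAX_PENDING 4                     /* registry capacity, like MPIU_CHKPMEM_DECL(n) */

struct chkpmem {
    void *ptrs[MAX_PENDING];              /* allocations not yet committed */
    int n;
};

/* allocate and register; returns NULL on allocation failure */
static void *chkpmem_malloc(struct chkpmem *c, size_t size)
{
    void *p = malloc(size);
    if (p != NULL && c->n < MAX_PENDING)  /* capacity assumed sufficient */
        c->ptrs[c->n++] = p;
    return p;
}

static void chkpmem_commit(struct chkpmem *c) { c->n = 0; }   /* keep everything */

static void chkpmem_reap(struct chkpmem *c)   /* free everything still pending */
{
    while (c->n > 0)
        free(c->ptrs[--c->n]);
}

int init_buffers(char **out)
{
    int err = 0;
    struct chkpmem c = { { NULL }, 0 };
    char *buf;

    buf = chkpmem_malloc(&c, 128);
    if (buf == NULL) { err = -1; goto fn_fail; }

    *out = buf;
    chkpmem_commit(&c);   /* success: ownership passes to the caller */
fn_exit:
    return err;
fn_fail:
    chkpmem_reap(&c);     /* failure: release all pending allocations */
    goto fn_exit;
}

Example #1 additionally splits the exit into fn_exit (with the commit) and fn_exit2 (without), so the failure path skips the commit entirely.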
Example #2
int MPIDI_VCRT_Create(int size, struct MPIDI_VCRT **vcrt_ptr)
{
    MPIDI_VCRT_t * vcrt;
    int mpi_errno = MPI_SUCCESS;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_VCRT_CREATE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_VCRT_CREATE);

    MPIU_CHKPMEM_MALLOC(vcrt, MPIDI_VCRT_t *, sizeof(MPIDI_VCRT_t) + (size - 1) * sizeof(MPIDI_VC_t *), mpi_errno, "**nomem");
    vcrt->handle = HANDLE_SET_KIND(0, HANDLE_KIND_INVALID);
    MPIU_Object_set_ref(vcrt, 1);
    vcrt->size = size;
    *vcrt_ptr = vcrt;

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_VCRT_CREATE);
    return mpi_errno;
 fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
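The allocation size sizeof(MPIDI_VCRT_t) + (size - 1) * sizeof(MPIDI_VC_t *) is the classic pre-C99 "struct hack": the struct ends in a one-element array that is deliberately over-allocated to hold size entries, and the "- 1" discounts the element already counted by sizeof. A hedged sketch with a hypothetical vc_table type:

#include <stdlib.h>

struct vc_table {
    int size;
    void *vc[1];    /* over-allocated below to hold 'size' pointers */
};

static struct vc_table *vc_table_create(int size)
{
    /* "- 1" discounts the one element already inside sizeof(struct vc_table) */
    struct vc_table *t = malloc(sizeof(struct vc_table)
                                + (size - 1) * sizeof(void *));
    if (t != NULL)
        t->size = size;
    return t;
}

A C99 flexible array member (void *vc[]; allocated as sizeof(struct vc_table) + size * sizeof(void *)) expresses the same layout without indexing past the declared array bound.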
Example #3
int MPIR_Comm_map_dup(MPID_Comm * newcomm, MPID_Comm * src_comm, MPIR_Comm_map_dir_t dir)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm_map_t *mapper;
    MPIU_CHKPMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_MAP_DUP);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_MAP_DUP);

    MPIU_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper");

    mapper->type = MPIR_COMM_MAP_DUP;
    mapper->src_comm = src_comm;
    mapper->dir = dir;

    mapper->next = NULL;

    MPL_LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper);

  fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_MAP_DUP);
    return mpi_errno;
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
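MPL_LL_APPEND(head, tail, elt) maintains a singly linked list with both head and tail pointers, so appending stays O(1) regardless of list length. A minimal sketch of the operation, assuming a hypothetical node type with a next field:

#include <stddef.h>

struct node { struct node *next; };

/* append elt at the tail; head and tail are updated in place */
static void ll_append(struct node **head, struct node **tail, struct node *elt)
{
    elt->next = NULL;
    if (*tail != NULL)
        (*tail)->next = elt;   /* non-empty list: link after the old tail */
    else
        *head = elt;           /* empty list: elt becomes the head */
    *tail = elt;
}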
Example #4
int MPID_nem_scif_init(MPIDI_PG_t * pg_p, int pg_rank, char **bc_val_p, int *val_max_sz_p)
{
    int mpi_errno = MPI_SUCCESS;
    int ret;
    int i;
    MPIU_CHKPMEM_DECL(2);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_SCIF_INIT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_SCIF_INIT);

    /* first make sure that our private fields in the vc fit into the
     * area provided  */
    MPIU_Assert(sizeof(MPID_nem_scif_vc_area) <= MPID_NEM_VC_NETMOD_AREA_LEN);

    MPID_nem_scif_nranks = pg_p->size;
    MPID_nem_scif_myrank = pg_rank;

    /* set up listener socket */
    if (MPID_nem_scif_myrank < MPID_nem_scif_nranks - 1) {
        listen_fd = scif_open();
        MPIU_ERR_CHKANDJUMP1(listen_fd == -1, mpi_errno, MPI_ERR_OTHER,
                             "**scif_open", "**scif_open %s", MPIU_Strerror(errno));

        listen_port = scif_bind(listen_fd, 0);
        MPIU_ERR_CHKANDJUMP1(listen_port == -1, mpi_errno, MPI_ERR_OTHER,
                             "**scif_bind", "**scif_bind %s", MPIU_Strerror(errno));

        ret = scif_listen(listen_fd, MPID_nem_scif_nranks);
        MPIU_ERR_CHKANDJUMP1(ret == -1, mpi_errno, MPI_ERR_OTHER,
                             "**scif_listen", "**scif_listen %s", MPIU_Strerror(errno));
    }

    /* create business card */
    mpi_errno = MPID_nem_scif_get_business_card(pg_rank, bc_val_p, val_max_sz_p);
    if (mpi_errno)
        MPIU_ERR_POP(mpi_errno);

    MPIU_CHKPMEM_MALLOC(MPID_nem_scif_conns, scifconn_t *,
                        MPID_nem_scif_nranks * sizeof(scifconn_t), mpi_errno,
                        "connection table");
    memset(MPID_nem_scif_conns, 0, MPID_nem_scif_nranks * sizeof(scifconn_t));
    for (i = 0; i < MPID_nem_scif_nranks; ++i)
        MPID_nem_scif_conns[i].fd = -1;

    MPIU_CHKPMEM_MALLOC(MPID_nem_scif_recv_buf, char *,
                        MPID_NEM_SCIF_RECV_MAX_PKT_LEN, mpi_errno, "SCIF temporary buffer");
    MPIU_CHKPMEM_COMMIT();

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_SCIF_INIT);
    return mpi_errno;
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
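Note the commit placement: here MPIU_CHKPMEM_COMMIT runs before the fn_exit label, so success and failure share a single exit. Examples #2 and #3 instead put the commit at fn_exit, which is safe because the reap on the failure path has already emptied the registry, and Examples #1 and #7 use a second label (fn_exit2) so the failure path bypasses the commit altogether.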
Example #5
static int handle_mprobe(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const req = e->user_ptr;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLE_PROBE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLE_PROBE);

    if (e->ni_fail_type == PTL_NI_NO_MATCH) {
        REQ_PTL(req)->found = FALSE;
        goto finish_mprobe;
    }

    REQ_PTL(req)->found = TRUE;
    req->status.MPI_SOURCE = NPTL_MATCH_GET_RANK(e->match_bits);
    req->status.MPI_TAG = NPTL_MATCH_GET_TAG(e->match_bits);
    MPIR_STATUS_SET_COUNT(req->status, NPTL_HEADER_GET_LENGTH(e->hdr_data));
    MPIDI_Request_set_sync_send_flag(req, e->hdr_data & NPTL_SSEND);

    MPIU_CHKPMEM_MALLOC(req->dev.tmpbuf, void *, e->mlength, mpi_errno, "tmpbuf");
    MPIU_Memcpy((char *)req->dev.tmpbuf, e->start, e->mlength);
    req->dev.recv_data_sz = e->mlength;

    if (!(e->hdr_data & NPTL_LARGE)) {
        MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_EAGER_MSG);
    }
    else {
        MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD);
        req->dev.match.parts.tag = req->status.MPI_TAG;
        req->dev.match.parts.context_id = NPTL_MATCH_GET_CTX(e->match_bits);
        req->dev.match.parts.rank = req->status.MPI_SOURCE;
        MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG);
    }

    /* At this point we know the ME is unlinked. Invalidate the handle to
       prevent further accesses, e.g. an attempted cancel. */
    REQ_PTL(req)->put_me = PTL_INVALID_HANDLE;
    req->dev.recv_pending_count = 1;

  finish_mprobe:
    mpi_errno = MPID_Request_complete(req);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLE_PROBE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
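Both the match and no-match paths funnel through finish_mprobe and then fall into fn_exit, so MPIU_CHKPMEM_COMMIT also runs on the PTL_NI_NO_MATCH path where no tmpbuf was allocated; committing an empty registry simply keeps nothing.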
Example #6
static int sched_get_cid_nonblock(MPID_Comm * comm_ptr, MPID_Comm * newcomm,
                                  MPIU_Context_id_t * ctx0, MPIU_Context_id_t * ctx1,
                                  MPID_Sched_t s, MPID_Comm_kind_t gcn_cid_kind)
{
    int mpi_errno = MPI_SUCCESS;
    struct gcn_state *st = NULL;
    MPIU_CHKPMEM_DECL(1);

    if (initialize_context_mask) {
        context_id_init();
    }

    MPIU_CHKPMEM_MALLOC(st, struct gcn_state *, sizeof(struct gcn_state), mpi_errno, "gcn_state");
    st->ctx0 = ctx0;
    st->ctx1 = ctx1;
    if (gcn_cid_kind == MPID_INTRACOMM) {
        st->comm_ptr = comm_ptr;
        st->comm_ptr_inter = NULL;
    }
    else {
        st->comm_ptr = comm_ptr->local_comm;
        st->comm_ptr_inter = comm_ptr;
    }
    st->s = s;
    st->gcn_cid_kind = gcn_cid_kind;
    *(st->ctx0) = 0;
    st->own_eager_mask = 0;
    st->first_iter = 1;
    st->new_comm = newcomm;
    st->own_mask = 0;
    if (eager_nelem < 0) {
        /* Ensure that at least one word of deadlock-free context IDs is
         * always set aside for the base protocol */
        MPIU_Assert(MPIR_CVAR_CTXID_EAGER_SIZE >= 0 &&
                    MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
        eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
    }
    mpi_errno = MPID_Sched_cb(&sched_cb_gcn_copy_mask, st, s);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPID_SCHED_BARRIER(s);

    MPIU_CHKPMEM_COMMIT();
  fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Example #7
static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPI_Aint last;
    void *buf;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);

    dequeue_req(e);

    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_unpack_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    if (e->type == PTL_EVENT_PUT_OVERFLOW)
        buf = e->start;
    else
        buf = REQ_PTL(rreq)->chunk_buffer[0];

    MPIU_Assert(e->mlength == PTL_LARGE_THRESHOLD);
    last = PTL_LARGE_THRESHOLD;
    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf);
    MPIU_Assert(last == PTL_LARGE_THRESHOLD);
    rreq->dev.segment_first += PTL_LARGE_THRESHOLD;
    MPIU_Free(REQ_PTL(rreq)->chunk_buffer[0]);

    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
Example #8
int MPIR_T_pvar_session_create_impl(MPI_T_pvar_session *session)
{
    int mpi_errno = MPI_SUCCESS;
    MPIU_CHKPMEM_DECL(1);

    *session = MPI_T_PVAR_SESSION_NULL;

    MPIU_CHKPMEM_MALLOC(*session, MPI_T_pvar_session, sizeof(**session), mpi_errno, "performance var session");

    /* essential for utlist to work */
    (*session)->hlist = NULL;

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    return mpi_errno;
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
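*session is set to MPI_T_PVAR_SESSION_NULL before the allocation is attempted, so even if MPIU_CHKPMEM_MALLOC jumps to fn_fail the caller receives a well-defined null handle rather than an uninitialized one.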
Example #9
int MPIR_Comm_map_irregular(MPID_Comm * newcomm, MPID_Comm * src_comm,
                            int *src_mapping, int src_mapping_size,
                            MPIR_Comm_map_dir_t dir, MPIR_Comm_map_t ** map)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm_map_t *mapper;
    MPIU_CHKPMEM_DECL(3);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_MAP_IRREGULAR);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_MAP_IRREGULAR);

    MPIU_CHKPMEM_MALLOC(mapper, MPIR_Comm_map_t *, sizeof(MPIR_Comm_map_t), mpi_errno, "mapper");

    mapper->type = MPIR_COMM_MAP_IRREGULAR;
    mapper->src_comm = src_comm;
    mapper->dir = dir;
    mapper->src_mapping_size = src_mapping_size;

    if (src_mapping) {
        mapper->src_mapping = src_mapping;
        mapper->free_mapping = 0;
    }
    else {
        MPIU_CHKPMEM_MALLOC(mapper->src_mapping, int *,
                            src_mapping_size * sizeof(int), mpi_errno, "mapper mapping");
        mapper->free_mapping = 1;
    }

    mapper->next = NULL;

    MPL_LL_APPEND(newcomm->mapper_head, newcomm->mapper_tail, mapper);

    if (map)
        *map = mapper;

  fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_MAP_IRREGULAR);
    return mpi_errno;
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
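The free_mapping flag records ownership: a caller-supplied src_mapping is merely borrowed, while a callee-allocated one must be freed when the mapper is torn down. A small sketch of this ownership-flag idiom, with hypothetical names:

#include <stdlib.h>

struct mapper {
    int *src_mapping;
    int  free_mapping;   /* 1 if we allocated src_mapping and must free it */
};

static int mapper_set_mapping(struct mapper *m, int *caller_map, int n)
{
    if (caller_map != NULL) {
        m->src_mapping = caller_map;   /* borrowed: caller keeps ownership */
        m->free_mapping = 0;
    } else {
        m->src_mapping = malloc(n * sizeof(int));
        if (m->src_mapping == NULL)
            return -1;
        m->free_mapping = 1;           /* owned: destructor must free */
    }
    return 0;
}

static void mapper_destroy(struct mapper *m)
{
    if (m->free_mapping)
        free(m->src_mapping);
}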
Example #10
int MPIDI_CH3_Connect_to_root (const char *port_name, MPIDI_VC_t **new_vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    *new_vc = NULL; /* so that the err handling knows to cleanup */

    MPIU_CHKPMEM_MALLOC (vc, MPIDI_VC_t *, sizeof(MPIDI_VC_t), mpi_errno, "vc");
    /* FIXME - where does this vc get freed?
       ANSWER (goodell@) - ch3u_port.c FreeNewVC
                           (but the VC_Destroy is in this file) */

    /* init ch3 portion of vc */
    MPIDI_VC_Init (vc, NULL, 0);

    /* init channel portion of vc */
    MPIR_ERR_CHKINTERNAL(!nemesis_initialized, mpi_errno, "Nemesis not initialized");
    vc->ch.recv_active = NULL;
    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    *new_vc = vc; /* we now have a valid, disconnected, temp VC */

    mpi_errno = MPID_nem_connect_to_root (port_name, vc);
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);
    return mpi_errno;
fn_fail:
    /* freeing without giving the lower layer a chance to cleanup can lead to
       leaks on error */
    if (*new_vc)
        MPIDI_CH3_VC_Destroy(*new_vc);
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
Example #11
int MPID_nem_lmt_dma_initiate_lmt(MPIDI_VC_t *vc, MPIDI_CH3_Pkt_t *pkt, MPID_Request *sreq)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_pkt_lmt_rts_t * const rts_pkt = (MPID_nem_pkt_lmt_rts_t *)pkt;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);
    
    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);

    MPIU_CHKPMEM_MALLOC(sreq->ch.s_cookie, knem_cookie_t *, sizeof(knem_cookie_t), mpi_errno, "s_cookie");

    mpi_errno = send_sreq_data(vc, sreq, sreq->ch.s_cookie);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPID_nem_lmt_send_RTS(vc, rts_pkt, sreq->ch.s_cookie, sizeof(knem_cookie_t));

fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_LMT_DMA_INITIATE_LMT);
    return mpi_errno;
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
Example #12
int MPIDI_CH3U_Comm_register_create_hook(int (*hook_fn)(struct MPID_Comm *, void *), void *param)
{
    int mpi_errno = MPI_SUCCESS;
    hook_elt *elt;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_COMM_REGISTER_CREATE_HOOK);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_COMM_REGISTER_CREATE_HOOK);

    MPIU_CHKPMEM_MALLOC(elt, hook_elt *, sizeof(hook_elt), mpi_errno, "hook_elt");

    elt->hook_fn = hook_fn;
    elt->param = param;
    
    MPL_LL_PREPEND(create_hooks_head, create_hooks_tail, elt);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_COMM_REGISTER_CREATE_HOOK);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
Example #13
int MPIR_Comm_create_calculate_mapping(MPID_Group  *group_ptr,
                                       MPID_Comm   *comm_ptr,
                                       int        **mapping_out,
                                       MPID_Comm **mapping_comm)
{
    int mpi_errno = MPI_SUCCESS;
    int subsetOfWorld = 0;
    int i, j;
    int n;
    int *mapping=0;
    MPIU_CHKPMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    *mapping_out = NULL;
    *mapping_comm = comm_ptr;

    n = group_ptr->size;
    MPIU_CHKPMEM_MALLOC(mapping,int*,n*sizeof(int),mpi_errno,"mapping");

    /* Make sure that the processes for this group are contained within
       the input communicator.  Also identify the mapping from the ranks of
       the old communicator to the new communicator.
       We do this by matching the lpids of the members of the group
       with the lpids of the members of the input communicator.
       It is an error if the group contains a reference to an lpid that
       does not exist in the communicator.

       An important special case is groups (and communicators) that
       are subsets of MPI_COMM_WORLD.  In this case, the lpids are
       exactly the same as the ranks in comm world.
    */

    /* we examine the group's lpids in both the intracomm and non-comm_world cases */
    MPIR_Group_setup_lpid_list( group_ptr );

    /* Optimize for groups contained within MPI_COMM_WORLD. */
    if (comm_ptr->comm_kind == MPID_INTRACOMM) {
        int wsize;
        subsetOfWorld = 1;
        wsize         = MPIR_Process.comm_world->local_size;
        for (i=0; i<n; i++) {
            int g_lpid = group_ptr->lrank_to_lpid[i].lpid;

            /* This mapping is relative to comm world */
            MPL_DBG_MSG_FMT(MPIR_DBG_COMM,VERBOSE,
                             (MPL_DBG_FDEST,
                              "comm-create - mapping into world[%d] = %d",
                              i, g_lpid ));
            if (g_lpid < wsize) {
                mapping[i] = g_lpid;
            }
            else {
                subsetOfWorld = 0;
                break;
            }
        }
    }
    MPL_DBG_MSG_D(MPIR_DBG_COMM,VERBOSE, "subsetOfWorld=%d", subsetOfWorld );
    if (subsetOfWorld) {
#           ifdef HAVE_ERROR_CHECKING
        {
            MPID_BEGIN_ERROR_CHECKS;
            {
                mpi_errno = MPIR_Group_check_subset( group_ptr, comm_ptr );
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);
            }
            MPID_END_ERROR_CHECKS;
        }
#           endif
        /* Override the comm to be used with the mapping array. */
        *mapping_comm = MPIR_Process.comm_world;
    }
    else {
        for (i=0; i<n; i++) {
            /* mapping[i] is the rank in the communicator of the process
               that is the ith element of the group */
            /* FIXME : BUBBLE SORT */
            mapping[i] = -1;
            for (j=0; j<comm_ptr->local_size; j++) {
                int comm_lpid;
                MPID_Comm_get_lpid( comm_ptr, j, &comm_lpid, FALSE );
                if (comm_lpid == group_ptr->lrank_to_lpid[i].lpid) {
                    mapping[i] = j;
                    break;
                }
            }
            MPIR_ERR_CHKANDJUMP1(mapping[i] == -1,mpi_errno,MPI_ERR_GROUP,
                                 "**groupnotincomm", "**groupnotincomm %d", i );
        }
    }

    MPIU_Assert(mapping != NULL);
    *mapping_out     = mapping;
    MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_out, n * sizeof(**mapping_out));

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);
    return mpi_errno;
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
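The subsetOfWorld fast path relies on the property stated in the block comment above: for groups contained in MPI_COMM_WORLD, lpids coincide with ranks in comm world, so mapping[i] is the lpid itself and the mapping communicator is overridden to comm_world. Only when that fails does the code fall back to the quadratic lpid search flagged by the FIXME.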
Example #14
static int init_default_collops(void)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    struct MPID_Collops *ops = NULL;
    MPIU_CHKPMEM_DECL(MPID_HIERARCHY_SIZE + 1);

    /* first initialize the intracomms */
    for (i = 0; i < MPID_HIERARCHY_SIZE; ++i) {
        MPIU_CHKPMEM_CALLOC(ops, struct MPID_Collops *, sizeof(struct MPID_Collops), mpi_errno,
                            "default intracomm collops");
        ops->ref_count = 1;     /* force existence until finalize time */

        /* intracomm default defaults... */
        ops->Ibcast_sched = &MPIR_Ibcast_intra;
        ops->Ibarrier_sched = &MPIR_Ibarrier_intra;
        ops->Ireduce_sched = &MPIR_Ireduce_intra;
        ops->Ialltoall_sched = &MPIR_Ialltoall_intra;
        ops->Ialltoallv_sched = &MPIR_Ialltoallv_intra;
        ops->Ialltoallw_sched = &MPIR_Ialltoallw_intra;
        ops->Iallreduce_sched = &MPIR_Iallreduce_intra;
        ops->Igather_sched = &MPIR_Igather_intra;
        ops->Igatherv_sched = &MPIR_Igatherv;
        ops->Iscatter_sched = &MPIR_Iscatter_intra;
        ops->Iscatterv_sched = &MPIR_Iscatterv;
        ops->Ireduce_scatter_sched = &MPIR_Ireduce_scatter_intra;
        ops->Ireduce_scatter_block_sched = &MPIR_Ireduce_scatter_block_intra;
        ops->Iallgather_sched = &MPIR_Iallgather_intra;
        ops->Iallgatherv_sched = &MPIR_Iallgatherv_intra;
        ops->Iscan_sched = &MPIR_Iscan_rec_dbl;
        ops->Iexscan_sched = &MPIR_Iexscan;
        ops->Neighbor_allgather = &MPIR_Neighbor_allgather_default;
        ops->Neighbor_allgatherv = &MPIR_Neighbor_allgatherv_default;
        ops->Neighbor_alltoall = &MPIR_Neighbor_alltoall_default;
        ops->Neighbor_alltoallv = &MPIR_Neighbor_alltoallv_default;
        ops->Neighbor_alltoallw = &MPIR_Neighbor_alltoallw_default;
        ops->Ineighbor_allgather = &MPIR_Ineighbor_allgather_default;
        ops->Ineighbor_allgatherv = &MPIR_Ineighbor_allgatherv_default;
        ops->Ineighbor_alltoall = &MPIR_Ineighbor_alltoall_default;
        ops->Ineighbor_alltoallv = &MPIR_Ineighbor_alltoallv_default;
        ops->Ineighbor_alltoallw = &MPIR_Ineighbor_alltoallw_default;

        /* override defaults, such as for SMP */
        switch (i) {
        case MPID_HIERARCHY_FLAT:
            break;
        case MPID_HIERARCHY_PARENT:
            ops->Ibcast_sched = &MPIR_Ibcast_SMP;
            ops->Iscan_sched = &MPIR_Iscan_SMP;
            ops->Iallreduce_sched = &MPIR_Iallreduce_SMP;
            ops->Ireduce_sched = &MPIR_Ireduce_SMP;
            break;
        case MPID_HIERARCHY_NODE:
            break;
        case MPID_HIERARCHY_NODE_ROOTS:
            break;

            /* --BEGIN ERROR HANDLING-- */
        default:
            MPIU_Assertp(FALSE);
            break;
            /* --END ERROR HANDLING-- */
        }

        /* this is a default table, it's not overriding another table */
        ops->prev_coll_fns = NULL;

        default_collops[i] = ops;
    }

    /* now the intercomm table */
    {
        MPIU_CHKPMEM_CALLOC(ops, struct MPID_Collops *, sizeof(struct MPID_Collops), mpi_errno,
                            "default intercomm collops");
        ops->ref_count = 1;     /* force existence until finalize time */

        /* intercomm defaults */
        ops->Ibcast_sched = &MPIR_Ibcast_inter;
        ops->Ibarrier_sched = &MPIR_Ibarrier_inter;
        ops->Ireduce_sched = &MPIR_Ireduce_inter;
        ops->Ialltoall_sched = &MPIR_Ialltoall_inter;
        ops->Ialltoallv_sched = &MPIR_Ialltoallv_inter;
        ops->Ialltoallw_sched = &MPIR_Ialltoallw_inter;
        ops->Iallreduce_sched = &MPIR_Iallreduce_inter;
        ops->Igather_sched = &MPIR_Igather_inter;
        ops->Igatherv_sched = &MPIR_Igatherv;
        ops->Iscatter_sched = &MPIR_Iscatter_inter;
        ops->Iscatterv_sched = &MPIR_Iscatterv;
        ops->Ireduce_scatter_sched = &MPIR_Ireduce_scatter_inter;
        ops->Ireduce_scatter_block_sched = &MPIR_Ireduce_scatter_block_inter;
        ops->Iallgather_sched = &MPIR_Iallgather_inter;
        ops->Iallgatherv_sched = &MPIR_Iallgatherv_inter;
        /* scan and exscan are not valid for intercommunicators, leave them NULL */
        /* Ineighbor_all* routines are not valid for intercommunicators, leave
         * them NULL */

        /* this is a default table, it's not overriding another table */
        ops->prev_coll_fns = NULL;

        ic_default_collops = ops;
    }


    /* run after MPID_Finalize to permit collective usage during finalize */
    MPIR_Add_finalize(cleanup_default_collops, NULL, MPIR_FINALIZE_CALLBACK_PRIO - 1);

    MPIU_CHKPMEM_COMMIT();
  fn_exit:
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
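The registry is declared as MPIU_CHKPMEM_DECL(MPID_HIERARCHY_SIZE + 1) because every allocation stays pending until the single commit at the bottom: one collops table per intracomm hierarchy level plus the one intercomm table.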
Example #15
PMPI_LOCAL int MPIR_Comm_create_calculate_mapping(MPID_Group  *group_ptr,
                                                  MPID_Comm   *comm_ptr,
                                                  MPID_VCR   **mapping_vcr_out,
                                                  int        **mapping_out)
{
    int mpi_errno = MPI_SUCCESS;
    int subsetOfWorld = 0;
    int i, j;
    int n;
    int *mapping=0;
    int vcr_size;
    MPID_VCR *vcr;
    MPIU_CHKPMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    MPID_MPI_FUNC_ENTER(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    *mapping_out = NULL;
    *mapping_vcr_out = NULL;

    /* N.B. For intracomms only the comm_ptr->vcr is valid and populated,
     * however local_size and remote_size are always set to the same value for
     * intracomms.  For intercomms both are valid and populated, with the
     * local_vcr holding VCs corresponding to the local_group, local_comm, and
     * local_size.
     *
     * For this mapping calculation we always want the logically local vcr,
     * regardless of whether it is stored in the "plain" vcr or local_vcr. */
    if (comm_ptr->comm_kind == MPID_INTERCOMM) {
        vcr      = comm_ptr->local_vcr;
        vcr_size = comm_ptr->local_size;
    }
    else {
        vcr      = comm_ptr->vcr;
        vcr_size = comm_ptr->remote_size;
    }

    n = group_ptr->size;
    MPIU_CHKPMEM_MALLOC(mapping,int*,n*sizeof(int),mpi_errno,"mapping");

    /* Make sure that the processes for this group are contained within
       the input communicator.  Also identify the mapping from the ranks of
       the old communicator to the new communicator.
       We do this by matching the lpids of the members of the group
       with the lpids of the members of the input communicator.
       It is an error if the group contains a reference to an lpid that
       does not exist in the communicator.

       An important special case is groups (and communicators) that
       are subsets of MPI_COMM_WORLD.  In this case, the lpids are
       exactly the same as the ranks in comm world.
    */

    /* we examine the group's lpids in both the intracomm and non-comm_world cases */
    MPIR_Group_setup_lpid_list( group_ptr );

    /* Optimize for groups contained within MPI_COMM_WORLD. */
    if (comm_ptr->comm_kind == MPID_INTRACOMM) {
        int wsize;
        subsetOfWorld = 1;
        wsize         = MPIR_Process.comm_world->local_size;
        for (i=0; i<n; i++) {
            int g_lpid = group_ptr->lrank_to_lpid[i].lpid;

            /* This mapping is relative to comm world */
            MPIU_DBG_MSG_FMT(COMM,VERBOSE,
                             (MPIU_DBG_FDEST,
                              "comm-create - mapping into world[%d] = %d\n",
                              i, g_lpid ));
            if (g_lpid < wsize) {
                mapping[i] = g_lpid;
            }
            else {
                subsetOfWorld = 0;
                break;
            }
        }
    }
    MPIU_DBG_MSG_D(COMM,VERBOSE, "subsetOfWorld=%d", subsetOfWorld );
    if (subsetOfWorld) {
#           ifdef HAVE_ERROR_CHECKING
        {
            MPID_BEGIN_ERROR_CHECKS;
            {
                int idx;
                mpi_errno = MPIR_GroupCheckVCRSubset( group_ptr, vcr_size, vcr, &idx );
                if (mpi_errno) MPIU_ERR_POP(mpi_errno);
            }
            MPID_END_ERROR_CHECKS;
        }
#           endif
        /* Override the vcr to be used with the mapping array. */
        vcr = MPIR_Process.comm_world->vcr;
        vcr_size = MPIR_Process.comm_world->local_size;
    }
    else {
        for (i=0; i<n; i++) {
            /* mapping[i] is the rank in the communicator of the process
               that is the ith element of the group */
            /* FIXME : BUBBLE SORT */
            mapping[i] = -1;
            for (j=0; j<vcr_size; j++) {
                int comm_lpid;
                MPID_VCR_Get_lpid( vcr[j], &comm_lpid );
                if (comm_lpid == group_ptr->lrank_to_lpid[i].lpid) {
                    mapping[i] = j;
                    break;
                }
            }
            MPIU_ERR_CHKANDJUMP1(mapping[i] == -1,mpi_errno,MPI_ERR_GROUP,
                                 "**groupnotincomm", "**groupnotincomm %d", i );
        }
    }

    MPIU_Assert(vcr != NULL);
    MPIU_Assert(mapping != NULL);
    *mapping_vcr_out = vcr;
    *mapping_out     = mapping;
    MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_vcr_out, vcr_size * sizeof(**mapping_vcr_out));
    MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_out, n * sizeof(**mapping_out));

    MPIU_CHKPMEM_COMMIT();
fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);
    return mpi_errno;
fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
}
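This appears to be an older variant of Example #13: it resolves lpids through VCRs (MPID_VCR_Get_lpid over either comm_ptr->vcr or local_vcr) and returns the chosen VCR array via mapping_vcr_out, where Example #13 queries the communicator directly with MPID_Comm_get_lpid and overrides *mapping_comm instead. The control flow, including the subsetOfWorld fast path, is otherwise the same.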
Example #16
static int handler_recv_dequeue_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPID_nem_ptl_vc_area *vc_ptl;
    int ret;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    
    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);
    vc_ptl = VC_PTL(vc);
    
    dequeue_req(e);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb);

    /* unpack data from unexpected buffer first */
    if (e->type == PTL_EVENT_PUT_OVERFLOW) {
        if (dt_contig) {
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength);
        } else {
            last = e->mlength;
            MPID_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start);
            MPIU_Assert(last == e->mlength);
            rreq->dev.segment_first = e->mlength;
        }
    }
    
    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }
        
    MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD);

    /* we need to GET the rest of the data from the sender's buffer */
    if (dt_contig) {
        big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD,
                vc, e->match_bits, rreq);
        goto fn_exit;
    }

    /* noncontig recv buffer */
    
    last = rreq->dev.segment_size;
    rreq->dev.iov_count = MPL_IOV_LIMIT;
    MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count);

    if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
        /* Rest of message fits in one IOV */
        ptl_md_t md;

        md.start = rreq->dev.iov;
        md.length = rreq->dev.iov_count;
        md.options = PTL_IOVEC;
        md.eq_handle = MPIDI_nem_ptl_origin_eq;
        md.ct_handle = PTL_CT_NONE;
        ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret));

        REQ_PTL(rreq)->event_handler = handler_recv_complete;
        ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg,
                     e->match_bits, 0, rreq);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret));
        goto fn_exit;
    }
        
    /* message won't fit in a single IOV, allocate buffer and unpack when received */
    /* FIXME: For now, allocate a single large buffer to hold entire message */
    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
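Three receive paths are distinguished here: a contiguous user buffer gets the remaining data directly via big_get; a noncontiguous buffer whose remainder fits in a single IOV is fetched with PtlMDBind plus MPID_nem_ptl_rptl_get; and the general noncontiguous case allocates one chunk_buffer for the whole remainder, to be unpacked on completion (the FIXME marks this as a stopgap for proper chunking).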
Example #17
/*@
MPI_Dist_graph_create - MPI_DIST_GRAPH_CREATE returns a handle to a new
communicator to which the distributed graph topology information is
attached.

Input Parameters:
+ comm_old - input communicator (handle)
. n - number of source nodes for which this process specifies edges 
  (non-negative integer)
. sources - array containing the n source nodes for which this process 
  specifies edges (array of non-negative integers)
. degrees - array specifying the number of destinations for each source node 
  in the source node array (array of non-negative integers)
. destinations - destination nodes for the source nodes in the source node 
  array (array of non-negative integers)
. weights - weights for source to destination edges (array of non-negative 
  integers or MPI_UNWEIGHTED)
. info - hints on optimization and interpretation of weights (handle)
- reorder - the ranks may be reordered (true) or not (false) (logical)

Output Parameters:
. comm_dist_graph - communicator with distributed graph topology added (handle)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_ARG
.N MPI_ERR_OTHER
@*/
int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[],
                          const int degrees[], const int destinations[],
                          const int weights[],
                          MPI_Info info, int reorder, MPI_Comm *comm_dist_graph)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = NULL;
    MPID_Comm *comm_dist_graph_ptr = NULL;
    MPI_Request *reqs = NULL;
    MPIR_Topology *topo_ptr = NULL;
    MPIR_Dist_graph_topology *dist_graph_ptr = NULL;
    int i;
    int j;
    int idx;
    int comm_size = 0;
    int in_capacity;
    int out_capacity;
    int **rout = NULL;
    int **rin = NULL;
    int *rin_sizes;
    int *rout_sizes;
    int *rin_idx;
    int *rout_idx;
    int *rs;
    int in_out_peers[2] = {-1, -1};
    int errflag = FALSE;
    MPIU_CHKLMEM_DECL(9);
    MPIU_CHKPMEM_DECL(1);
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPIU_THREAD_CS_ENTER(ALLFUNC,);
    MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm_old, mpi_errno);
            MPIR_ERRTEST_INFO_OR_NULL(info, mpi_errno);
            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr(comm_old, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate comm_ptr */
            MPID_Comm_valid_ptr(comm_ptr, mpi_errno);
            /* If comm_ptr is not valid, it will be reset to null */
            if (comm_ptr) {
                MPIR_ERRTEST_COMM_INTRA(comm_ptr, mpi_errno);
            }

            MPIR_ERRTEST_ARGNEG(n, "n", mpi_errno);
            if (n > 0) {
                int have_degrees = 0;
                MPIR_ERRTEST_ARGNULL(sources, "sources", mpi_errno);
                MPIR_ERRTEST_ARGNULL(degrees, "degrees", mpi_errno);
                for (i = 0; i < n; ++i) {
                    if (degrees[i]) {
                        have_degrees = 1;
                        break;
                    }
                }
                if (have_degrees) {
                    MPIR_ERRTEST_ARGNULL(destinations, "destinations", mpi_errno);
                    if (weights != MPI_UNWEIGHTED)
                        MPIR_ERRTEST_ARGNULL(weights, "weights", mpi_errno);
                }
            }

            if (mpi_errno != MPI_SUCCESS) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */


    /* ... body of routine ...  */
    /* Implementation based on Torsten Hoefler's reference implementation
     * attached to MPI-2.2 ticket #33. */
    *comm_dist_graph = MPI_COMM_NULL;

    comm_size = comm_ptr->local_size;

    /* following the spirit of the old topo interface, attributes do not
     * propagate to the new communicator (see MPI-2.1 pp. 243 line 11) */
    mpi_errno = MPIR_Comm_copy(comm_ptr, comm_size, &comm_dist_graph_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPIU_Assert(comm_dist_graph_ptr != NULL);

    /* rin is an array of size comm_size of pointers to arrays of length
     * rin_sizes[x].  rin[x] holds the locally specified edges whose
     * destination is rank x.
     *
     * rout is an array of size comm_size of pointers to arrays of length
     * rout_sizes[x].  rout[x] holds the locally specified edges whose
     * source is rank x. */
    MPIU_CHKLMEM_MALLOC(rout,       int **, comm_size*sizeof(int*), mpi_errno, "rout");
    MPIU_CHKLMEM_MALLOC(rin,        int **, comm_size*sizeof(int*), mpi_errno, "rin");
    MPIU_CHKLMEM_MALLOC(rin_sizes,  int *, comm_size*sizeof(int), mpi_errno, "rin_sizes");
    MPIU_CHKLMEM_MALLOC(rout_sizes, int *, comm_size*sizeof(int), mpi_errno, "rout_sizes");
    MPIU_CHKLMEM_MALLOC(rin_idx,    int *, comm_size*sizeof(int), mpi_errno, "rin_idx");
    MPIU_CHKLMEM_MALLOC(rout_idx,   int *, comm_size*sizeof(int), mpi_errno, "rout_idx");

    memset(rout,       0, comm_size*sizeof(int*));
    memset(rin,        0, comm_size*sizeof(int*));
    memset(rin_sizes,  0, comm_size*sizeof(int));
    memset(rout_sizes, 0, comm_size*sizeof(int));
    memset(rin_idx,    0, comm_size*sizeof(int));
    memset(rout_idx,   0, comm_size*sizeof(int));

    /* compute array sizes */
    idx = 0;
    for (i = 0; i < n; ++i) {
        MPIU_Assert(sources[i] < comm_size);
        for (j = 0; j < degrees[i]; ++j) {
            MPIU_Assert(destinations[idx] < comm_size);
            /* each edge contributes two ints (peer, weight), so rout_sizes[i]
             * is twice the number of edges to be sent to rank i by this
             * process */
            rout_sizes[sources[i]] += 2;
            rin_sizes[destinations[idx]] += 2;
            ++idx;
        }
    }

    /* allocate arrays */
    for (i = 0; i < comm_size; ++i) {
        /* can't use CHKLMEM macros b/c we are in a loop */
        if (rin_sizes[i]) {
            rin[i] = MPIU_Malloc(rin_sizes[i] * sizeof(int));
        }
        if (rout_sizes[i]) {
            rout[i] = MPIU_Malloc(rout_sizes[i] * sizeof(int));
        }
    }

    /* populate arrays */
    idx = 0;
    for (i = 0; i < n; ++i) {
        /* TODO add this assert as proper error checking above */
        int s_rank = sources[i];
        MPIU_Assert(s_rank < comm_size);
        MPIU_Assert(s_rank >= 0);

        for (j = 0; j < degrees[i]; ++j) {
            int d_rank = destinations[idx];
            int weight = (weights == MPI_UNWEIGHTED ? 0 : weights[idx]);
            /* TODO add this assert as proper error checking above */
            MPIU_Assert(d_rank < comm_size);
            MPIU_Assert(d_rank >= 0);

            /* XXX DJG what about self-edges? do we need to drop one of these
             * cases when there is a self-edge to avoid double-counting? */

            /* rout[s][2*x] is the value of d for the j'th edge between (s,d)
             * with weight rout[s][2*x+1], where x is the current end of the
             * outgoing edge list for s.  x==(rout_idx[s]/2) */
            rout[s_rank][rout_idx[s_rank]++] = d_rank;
            rout[s_rank][rout_idx[s_rank]++] = weight;

            /* rin[d][2*x] is the value of s for the j'th edge between (s,d)
             * with weight rout[d][2*x+1], where x is the current end of the
             * incoming edge list for d.  x==(rin_idx[d]/2) */
            rin[d_rank][rin_idx[d_rank]++] = s_rank;
            rin[d_rank][rin_idx[d_rank]++] = weight;

            ++idx;
        }
    }

    for (i = 0; i < comm_size; ++i) {
        /* sanity check that all arrays are fully populated */
        MPIU_Assert(rin_idx[i] == rin_sizes[i]);
        MPIU_Assert(rout_idx[i] == rout_sizes[i]);
    }

    MPIU_CHKLMEM_MALLOC(rs, int *, 2*comm_size*sizeof(int), mpi_errno, "red-scat source buffer");
    for (i = 0; i < comm_size; ++i) {
        rs[2*i]   = (rin_sizes[i]  ? 1 : 0);
        rs[2*i+1] = (rout_sizes[i] ? 1 : 0);
    }

    /* compute the number of peers I will recv from */
    mpi_errno = MPIR_Reduce_scatter_block_impl(rs, in_out_peers, 2, MPI_INT, MPI_SUM, comm_ptr, &errflag);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    MPIU_Assert(in_out_peers[0] <= comm_size && in_out_peers[0] >= 0);
    MPIU_Assert(in_out_peers[1] <= comm_size && in_out_peers[1] >= 0);

    idx = 0;
    /* must be 2*comm_size requests because we will possibly send inbound and
     * outbound edges to everyone in our communicator */
    MPIU_CHKLMEM_MALLOC(reqs, MPI_Request *, 2*comm_size*sizeof(MPI_Request), mpi_errno, "temp request array");
    for (i = 0; i < comm_size; ++i) {
        if (rin_sizes[i]) {
            /* send edges where i is a destination to process i */
            mpi_errno = MPIC_Isend(&rin[i][0], rin_sizes[i], MPI_INT, i, MPIR_TOPO_A_TAG, comm_old, &reqs[idx++]);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
        if (rout_sizes[i]) {
            /* send edges where i is a source to process i */
            mpi_errno = MPIC_Isend(&rout[i][0], rout_sizes[i], MPI_INT, i, MPIR_TOPO_B_TAG, comm_old, &reqs[idx++]);
            if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        }
    }
    MPIU_Assert(idx <= (2 * comm_size));

    /* Create the topology structure */
    MPIU_CHKPMEM_MALLOC(topo_ptr, MPIR_Topology *, sizeof(MPIR_Topology), mpi_errno, "topo_ptr");
    topo_ptr->kind = MPI_DIST_GRAPH;
    dist_graph_ptr = &topo_ptr->topo.dist_graph;
    dist_graph_ptr->indegree = 0;
    dist_graph_ptr->in = NULL;
    dist_graph_ptr->in_weights = NULL;
    dist_graph_ptr->outdegree = 0;
    dist_graph_ptr->out = NULL;
    dist_graph_ptr->out_weights = NULL;
    dist_graph_ptr->is_weighted = (weights != MPI_UNWEIGHTED);

    /* can't use CHKPMEM macros for this b/c we need to realloc */
    in_capacity = 10; /* arbitrary */
    dist_graph_ptr->in = MPIU_Malloc(in_capacity*sizeof(int));
    if (dist_graph_ptr->is_weighted)
        dist_graph_ptr->in_weights = MPIU_Malloc(in_capacity*sizeof(int));
    out_capacity = 10; /* arbitrary */
    dist_graph_ptr->out = MPIU_Malloc(out_capacity*sizeof(int));
    if (dist_graph_ptr->is_weighted)
        dist_graph_ptr->out_weights = MPIU_Malloc(out_capacity*sizeof(int));

    for (i = 0; i < in_out_peers[0]; ++i) {
        MPI_Status status;
        int count;
        int *buf;
        /* receive inbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_old, &status);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop */
        buf = MPIU_Malloc(count*sizeof(int));
        MPIU_ERR_CHKANDJUMP(!buf, mpi_errno, MPIR_ERR_RECOVERABLE, "**nomem");

        mpi_errno = MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_old, MPI_STATUS_IGNORE);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        
        for (j = 0; j < count/2; ++j) {
            int deg = dist_graph_ptr->indegree++;
            if (deg >= in_capacity) {
                in_capacity *= 2;
                MPIU_REALLOC_ORJUMP(dist_graph_ptr->in, in_capacity*sizeof(int), mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    MPIU_REALLOC_ORJUMP(dist_graph_ptr->in_weights, in_capacity*sizeof(int), mpi_errno);
            }
            dist_graph_ptr->in[deg] = buf[2*j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->in_weights[deg] = buf[2*j+1];
        }
        MPIU_Free(buf);
    }

    for (i = 0; i < in_out_peers[1]; ++i) {
        MPI_Status status;
        int count;
        int *buf;
        /* receive outbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_old, &status);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop */
        buf = MPIU_Malloc(count*sizeof(int));
        MPIU_ERR_CHKANDJUMP(!buf, mpi_errno, MPIR_ERR_RECOVERABLE, "**nomem");

        mpi_errno = MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_old, MPI_STATUS_IGNORE);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);

        for (j = 0; j < count/2; ++j) {
            int deg = dist_graph_ptr->outdegree++;
            if (deg >= out_capacity) {
                out_capacity *= 2;
                MPIU_REALLOC_ORJUMP(dist_graph_ptr->out, out_capacity*sizeof(int), mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    MPIU_REALLOC_ORJUMP(dist_graph_ptr->out_weights, out_capacity*sizeof(int), mpi_errno);
            }
            dist_graph_ptr->out[deg] = buf[2*j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->out_weights[deg] = buf[2*j+1];
        }
        MPIU_Free(buf);
    }

    mpi_errno = MPIR_Waitall_impl(idx, reqs, MPI_STATUSES_IGNORE);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* remove any excess memory allocation */
    MPIU_REALLOC_ORJUMP(dist_graph_ptr->in, dist_graph_ptr->indegree*sizeof(int), mpi_errno);
    MPIU_REALLOC_ORJUMP(dist_graph_ptr->out, dist_graph_ptr->outdegree*sizeof(int), mpi_errno);
    if (dist_graph_ptr->is_weighted) {
        MPIU_REALLOC_ORJUMP(dist_graph_ptr->in_weights, dist_graph_ptr->indegree*sizeof(int), mpi_errno);
        MPIU_REALLOC_ORJUMP(dist_graph_ptr->out_weights, dist_graph_ptr->outdegree*sizeof(int), mpi_errno);
    }

    mpi_errno = MPIR_Topology_put(comm_dist_graph_ptr, topo_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    MPIU_CHKPMEM_COMMIT();

    MPIU_OBJ_PUBLISH_HANDLE(*comm_dist_graph, comm_dist_graph_ptr->handle);

    /* ... end of body of routine ... */

  fn_exit:
    for (i = 0; i < comm_size; ++i) {
        if (rin[i])
            MPIU_Free(rin[i]);
        if (rout[i])
            MPIU_Free(rout[i]);
    }

    MPIU_CHKLMEM_FREEALL();

    MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_DIST_GRAPH_CREATE);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    if (dist_graph_ptr && dist_graph_ptr->in)
        MPIU_Free(dist_graph_ptr->in);
    if (dist_graph_ptr && dist_graph_ptr->in_weights)
        MPIU_Free(dist_graph_ptr->in_weights);
    if (dist_graph_ptr && dist_graph_ptr->out)
        MPIU_Free(dist_graph_ptr->out);
    if (dist_graph_ptr && dist_graph_ptr->out_weights)
        MPIU_Free(dist_graph_ptr->out_weights);
    MPIU_CHKPMEM_REAP();
#ifdef HAVE_ERROR_CHECKING
    mpi_errno = MPIR_Err_create_code(
        mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER,
        "**mpi_dist_graph_create", "**mpi_dist_graph_create %C %d %p %p %p %p %I %d %p",
        comm_old, n, sources, degrees, destinations, weights, info, reorder, comm_dist_graph);
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
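The in/out edge arrays above grow by capacity doubling: starting from an arbitrary capacity of 10, the capacity doubles whenever the degree reaches it, keeping appends amortized O(1); MPIU_REALLOC_ORJUMP additionally jumps to fn_fail if the reallocation fails. A standalone sketch of the same growth idiom with a hypothetical int_vec type:

#include <stdlib.h>

struct int_vec {
    int *data;
    int  len;
    int  cap;
};

/* append v, doubling capacity as needed; returns 0 on success, -1 on OOM */
static int int_vec_push(struct int_vec *a, int v)
{
    if (a->len >= a->cap) {
        int new_cap = a->cap ? 2 * a->cap : 10;   /* 10: arbitrary, as above */
        int *p = realloc(a->data, new_cap * sizeof(int));
        if (p == NULL)
            return -1;    /* old buffer is still valid and owned by the caller */
        a->data = p;
        a->cap = new_cap;
    }
    a->data[a->len++] = v;
    return 0;
}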