Exemple #1
0
int MPID_nem_ptl_init_id(MPIDI_VC_t *vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_nem_ptl_vc_area *const vc_ptl = VC_PTL(vc);
    char *bc;
    int pmi_errno;
    int val_max_sz;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_PTL_INIT_ID);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_PTL_INIT_ID);

    pmi_errno = PMI_KVS_Get_value_length_max(&val_max_sz);
    MPIR_ERR_CHKANDJUMP1(pmi_errno, mpi_errno, MPI_ERR_OTHER, "**fail", "**fail %d", pmi_errno);
    MPIR_CHKLMEM_MALLOC(bc, char *, val_max_sz, mpi_errno, "bc");

    mpi_errno = vc->pg->getConnInfo(vc->pg_rank, bc, val_max_sz, vc->pg);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPID_nem_ptl_get_id_from_bc(bc, &vc_ptl->id, &vc_ptl->pt, &vc_ptl->ptg, &vc_ptl->ptc, &vc_ptl->ptr, &vc_ptl->ptrg, &vc_ptl->ptrc);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    vc_ptl->id_initialized = TRUE;

    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);
    
 fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_PTL_INIT_ID);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
Exemple #2
0
static int MPIDI_CH3I_Win_detect_shm(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Win *shm_win_ptr = NULL;
    int i, node_size;
    MPI_Aint *base_shm_offs;

    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        goto fn_exit;
    }

    node_size = (*win_ptr)->comm_ptr->node_comm->local_size;

    MPIR_CHKLMEM_MALLOC(base_shm_offs, MPI_Aint *, node_size * sizeof(MPI_Aint),
                        mpi_errno, "base_shm_offs");

    /* Return the first matched shared window.
     * It is noted that the shared windows including all local processes are
     * stored in every local process in the same order, hence the first matched
     * shared window on every local process should be the same. */
    mpi_errno = MPIDI_CH3I_SHM_Wins_match(win_ptr, &shm_win_ptr, &base_shm_offs);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    if (shm_win_ptr == NULL)
        goto fn_exit;

    (*win_ptr)->shm_allocated = TRUE;
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* Compute the base address of shm buffer on each process.
     * shm_base_addrs[i] = my_shm_base_addr + off[i] */
    for (i = 0; i < node_size; i++) {
        (*win_ptr)->shm_base_addrs[i] =
            (void *) ((MPI_Aint) shm_win_ptr->shm_base_addr + base_shm_offs[i]);
    }

    /* TODO: should we use the same mutex or create a new one ?
     * It causes unnecessary synchronization.*/
    (*win_ptr)->shm_mutex = shm_win_ptr->shm_mutex;

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
int MPIR_Ineighbor_alltoallw_sched_allcomm_linear(const void *sendbuf, const int sendcounts[],
                                                  const MPI_Aint sdispls[],
                                                  const MPI_Datatype sendtypes[], void *recvbuf,
                                                  const int recvcounts[], const MPI_Aint rdispls[],
                                                  const MPI_Datatype recvtypes[],
                                                  MPIR_Comm * comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int indegree, outdegree, weighted;
    int k, l;
    int *srcs, *dsts;
    MPIR_CHKLMEM_DECL(2);

    mpi_errno = MPIR_Topo_canon_nhb_count(comm_ptr, &indegree, &outdegree, &weighted);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_CHKLMEM_MALLOC(srcs, int *, indegree * sizeof(int), mpi_errno, "srcs", MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(dsts, int *, outdegree * sizeof(int), mpi_errno, "dsts", MPL_MEM_COMM);
    mpi_errno = MPIR_Topo_canon_nhb(comm_ptr,
                                    indegree, srcs, MPI_UNWEIGHTED,
                                    outdegree, dsts, MPI_UNWEIGHTED);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    for (k = 0; k < outdegree; ++k) {
        char *sb;
        MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf + sdispls[k]);

        sb = ((char *) sendbuf) + sdispls[k];
        mpi_errno = MPIR_Sched_send(sb, sendcounts[k], sendtypes[k], dsts[k], comm_ptr, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    for (l = 0; l < indegree; ++l) {
        char *rb;
        MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf + rdispls[l]);

        rb = ((char *) recvbuf) + rdispls[l];
        mpi_errno = MPIR_Sched_recv(rb, recvcounts[l], recvtypes[l], srcs[l], comm_ptr, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    MPIR_SCHED_BARRIER(s);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
int MPIR_Ineighbor_allgatherv_sched_allcomm_linear(const void *sendbuf, int sendcount,
                                                   MPI_Datatype sendtype, void *recvbuf,
                                                   const int recvcounts[], const int displs[],
                                                   MPI_Datatype recvtype, MPIR_Comm * comm_ptr,
                                                   MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int indegree, outdegree, weighted;
    int k, l;
    int *srcs, *dsts;
    MPI_Aint recvtype_extent;
    MPIR_CHKLMEM_DECL(2);

    MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent);

    mpi_errno = MPIR_Topo_canon_nhb_count(comm_ptr, &indegree, &outdegree, &weighted);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_CHKLMEM_MALLOC(srcs, int *, indegree * sizeof(int), mpi_errno, "srcs", MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(dsts, int *, outdegree * sizeof(int), mpi_errno, "dsts", MPL_MEM_COMM);
    mpi_errno = MPIR_Topo_canon_nhb(comm_ptr,
                                    indegree, srcs, MPI_UNWEIGHTED,
                                    outdegree, dsts, MPI_UNWEIGHTED);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    for (k = 0; k < outdegree; ++k) {
        mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, dsts[k], comm_ptr, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    for (l = 0; l < indegree; ++l) {
        char *rb = ((char *) recvbuf) + displs[l] * recvtype_extent;
        mpi_errno = MPIR_Sched_recv(rb, recvcounts[l], recvtype, srcs[l], comm_ptr, s);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }

    MPIR_SCHED_BARRIER(s);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Exemple #5
0
int MPIR_Type_indexed_impl(int count, const int *array_of_blocklengths,
                           const int *array_of_displacements,
                           MPI_Datatype oldtype, MPI_Datatype * newtype)
{
    int mpi_errno = MPI_SUCCESS;
    MPI_Datatype new_handle;
    MPIR_Datatype *new_dtp;
    int i, *ints;
    MPIR_CHKLMEM_DECL(1);

    mpi_errno = MPIR_Type_indexed(count, array_of_blocklengths, array_of_displacements, 0,      /* displacements not in bytes */
                                  oldtype, &new_handle);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* copy all integer values into a temporary buffer; this
     * includes the count, the blocklengths, and the displacements.
     */
    MPIR_CHKLMEM_MALLOC(ints, int *, (2 * count + 1) * sizeof(int), mpi_errno,
                        "contents integer array", MPL_MEM_BUFFER);

    ints[0] = count;

    for (i = 0; i < count; i++) {
        ints[i + 1] = array_of_blocklengths[i];
    }
    for (i = 0; i < count; i++) {
        ints[i + count + 1] = array_of_displacements[i];
    }
    MPIR_Datatype_get_ptr(new_handle, new_dtp);
    mpi_errno = MPIR_Datatype_set_contents(new_dtp, MPI_COMBINER_INDEXED, 2 * count + 1,        /* ints */
                                           0,   /* aints  */
                                           1,   /* types */
                                           ints, NULL, &oldtype);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_OBJ_PUBLISH_HANDLE(*newtype, new_handle);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Exemple #6
0
int MPII_Genutil_sched_sink(MPII_Genutil_sched_t * sched)
{
    MPL_DBG_MSG_FMT(MPIR_DBG_COLL, VERBOSE,
                    (MPL_DBG_FDEST, "Gentran: sched [sink] total=%d", sched->total_vtcs));

    vtx_t *vtxp;
    int i, n_in_vtcs = 0, vtx_id;
    int *in_vtcs;
    int mpi_errno = MPI_SUCCESS;

    MPIR_CHKLMEM_DECL(1);

    /* assign a new vertex */
    vtx_id = MPII_Genutil_vtx_create(sched, &vtxp);

    vtxp->vtx_kind = MPII_GENUTIL_VTX_KIND__SINK;

    MPIR_CHKLMEM_MALLOC(in_vtcs, int *, sizeof(int) * vtx_id,
                        mpi_errno, "in_vtcs buffer", MPL_MEM_COLL);
    /* record incoming vertices */
    for (i = vtx_id - 1; i >= 0; i--) {
        vtx_t *sched_fence = (vtx_t *) utarray_eltptr(sched->vtcs, i);
        MPIR_Assert(sched_fence != NULL);
        if (sched_fence->vtx_kind == MPII_GENUTIL_VTX_KIND__FENCE)
            /* no need to record this and any vertex before fence.
             * Dependency on the last fence call will be added by
             * the subsequent call to MPIC_Genutil_vtx_add_dependencies function */
            break;
        else {
            in_vtcs[vtx_id - 1 - i] = i;
            n_in_vtcs++;
        }
    }

    MPII_Genutil_vtx_add_dependencies(sched, vtx_id, n_in_vtcs, in_vtcs);
  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return vtx_id;
  fn_fail:
    goto fn_exit;
}
Exemple #7
0
/* MPIR_Find_external -- from the list of processes in comm,
 * builds a list of external processes, i.e., one process from each node.
 * You can think of this as the root or master process for each node.
 *
 * Note that this will not work correctly for spawned or attached
 * processes.
 *
 *  OUT:
 *    external_size_p   - number of external processes
 *    external_rank_p   - rank of this process among the external
 *                        processes, or -1 if this process is not external
 *    external_ranks_p  - (*external_ranks_p)[i]   = the rank in comm
 *                        of the process with external rank i.
 *                        This is of size (*external_size_p)
 *    internode_table_p - (*internode_table_p)[i]  = the rank in
 *    (optional)          *external_ranks_p of the root of the node
 *                        containing rank i in comm.  It is of size
 *                        comm->remote_size. No return if NULL is specified.
 */
int MPIR_Find_external(MPIR_Comm * comm, int *external_size_p, int *external_rank_p,
                       int **external_ranks_p, int **internode_table_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *nodes;
    int i, external_size, external_rank;
    int *external_ranks, *internode_table;
    int max_node_id, node_id;

    MPIR_CHKLMEM_DECL(1);
    MPIR_CHKPMEM_DECL(2);

    /* Scan through the list of processes in comm and add one
     * process from each node to the list of "external" processes.  We
     * add the first process we find from each node.  nodes[] is an
     * array where we keep track of whether we have already added that
     * node to the list. */

    /* external_ranks will be realloc'ed later to the appropriate size (currently unknown) */
    /* FIXME: realloc doesn't guarantee that the allocated area will be
     * shrunk - so using realloc is not an appropriate strategy. */
    MPIR_CHKPMEM_MALLOC(external_ranks, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "external_ranks", MPL_MEM_COMM);
    MPIR_CHKPMEM_MALLOC(internode_table, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "internode_table", MPL_MEM_COMM);

    mpi_errno = MPID_Get_max_node_id(comm, &max_node_id);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(max_node_id >= 0);
    MPIR_CHKLMEM_MALLOC(nodes, int *, sizeof(int) * (max_node_id + 1), mpi_errno, "nodes",
                        MPL_MEM_COMM);

    /* nodes maps node_id to rank in external_ranks of leader for that node */
    for (i = 0; i < (max_node_id + 1); ++i)
        nodes[i] = -1;

    external_size = 0;
    external_rank = -1;

    for (i = 0; i < comm->remote_size; ++i) {
        mpi_errno = MPID_Get_node_id(comm, i, &node_id);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* The upper level can catch this non-fatal error and should be
         * able to recover gracefully. */
        MPIR_ERR_CHKANDJUMP(node_id < 0, mpi_errno, MPI_ERR_OTHER, "**dynamic_node_ids");

        MPIR_Assert(node_id <= max_node_id);

        /* build list of external processes */
        if (nodes[node_id] == -1) {
            if (i == comm->rank)
                external_rank = external_size;
            nodes[node_id] = external_size;
            external_ranks[external_size] = i;
            ++external_size;
        }

        /* build the map from rank in comm to rank in external_ranks */
        internode_table[i] = nodes[node_id];
    }

#ifdef ENABLE_DEBUG
    printf("------------------------------------------------------------------------\n");
    printf("[%d]comm = %p\n", comm->rank, comm);
    printf("[%d]comm->size = %d\n", comm->rank, comm->remote_size);
    printf("[%d]comm->rank = %d\n", comm->rank, comm->rank);
    printf("[%d]external_size = %d\n", comm->rank, external_size);
    printf("[%d]external_rank = %d\n", comm->rank, external_rank);
    printf("[%d]external_ranks = %p\n", comm->rank, external_ranks);
    for (i = 0; i < external_size; ++i)
        printf("[%d]  external_ranks[%d] = %d\n", comm->rank, i, external_ranks[i]);
    printf("[%d]internode_table = %p\n", comm->rank, internode_table);
    for (i = 0; i < comm->remote_size; ++i)
        printf("[%d]  internode_table[%d] = %d\n", comm->rank, i, internode_table[i]);
    printf("[%d]nodes = %p\n", comm->rank, nodes);
    for (i = 0; i < (max_node_id + 1); ++i)
        printf("[%d]  nodes[%d] = %d\n", comm->rank, i, nodes[i]);
#endif

    MPIR_CHKPMEM_COMMIT();

    *external_size_p = external_size;
    *external_rank_p = external_rank;
    *external_ranks_p = MPL_realloc(external_ranks, sizeof(int) * external_size, MPL_MEM_COMM);
    MPIR_ERR_CHKANDJUMP(*external_ranks_p == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem2");

    if (internode_table_p)
        *internode_table_p = internode_table;   /* no need to realloc */
    else
        MPL_free(internode_table);      /* free internally if caller passes NULL */

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
int MPIR_Alltoall_intra_scattered(
    const void *sendbuf,
    int sendcount, 
    MPI_Datatype sendtype, 
    void *recvbuf, 
    int recvcount, 
    MPI_Datatype recvtype, 
    MPIR_Comm *comm_ptr,
    MPIR_Errflag_t *errflag )
{
    int          comm_size, i, j;
    MPI_Aint     sendtype_extent, recvtype_extent;
    int mpi_errno=MPI_SUCCESS, dst, rank;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Datatype newtype = MPI_DATATYPE_NULL;
    MPIR_Request **reqarray;
    MPI_Status *starray;
    MPIR_CHKLMEM_DECL(6);

    if (recvcount == 0) return MPI_SUCCESS;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

#ifdef HAVE_ERROR_CHECKING
    MPIR_Assert(sendbuf != MPI_IN_PLACE);
#endif /* HAVE_ERROR_CHECKING */

    /* Get extent of send and recv types */
    MPIR_Datatype_get_extent_macro(recvtype, recvtype_extent);
    MPIR_Datatype_get_extent_macro(sendtype, sendtype_extent);
    int ii, ss, bblock;

    bblock = MPIR_CVAR_ALLTOALL_THROTTLE;
    if (bblock == 0) bblock = comm_size;

    MPIR_CHKLMEM_MALLOC(reqarray, MPIR_Request **, 2*bblock*sizeof(MPIR_Request*), mpi_errno, "reqarray", MPL_MEM_BUFFER);

    MPIR_CHKLMEM_MALLOC(starray, MPI_Status *, 2*bblock*sizeof(MPI_Status), mpi_errno, "starray", MPL_MEM_BUFFER);

    for (ii=0; ii<comm_size; ii+=bblock) {
        ss = comm_size-ii < bblock ? comm_size-ii : bblock;
        /* do the communication -- post ss sends and receives: */
        for ( i=0; i<ss; i++ ) { 
            dst = (rank+i+ii) % comm_size;
            mpi_errno = MPIC_Irecv((char *)recvbuf +
                                      dst*recvcount*recvtype_extent, 
                                      recvcount, recvtype, dst,
                                      MPIR_ALLTOALL_TAG, comm_ptr,
                                      &reqarray[i]);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }

        for ( i=0; i<ss; i++ ) { 
            dst = (rank-i-ii+comm_size) % comm_size;
            mpi_errno = MPIC_Isend((char *)sendbuf +
                                      dst*sendcount*sendtype_extent, 
                                      sendcount, sendtype, dst,
                                      MPIR_ALLTOALL_TAG, comm_ptr,
                                      &reqarray[i+ss], errflag);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }

        /* ... then wait for them to finish: */
        mpi_errno = MPIC_Waitall(2*ss,reqarray,starray, errflag);
        if (mpi_errno && mpi_errno != MPI_ERR_IN_STATUS) MPIR_ERR_POP(mpi_errno);
        
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (j=0; j<2*ss; j++) {
                if (starray[j].MPI_ERROR != MPI_SUCCESS) {
                    mpi_errno = starray[j].MPI_ERROR;
                    if (mpi_errno) {
                        /* for communication errors, just record the error but continue */
                        *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
                        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
                    }
                }
            }
        }
        /* --END ERROR HANDLING-- */
    }

 fn_exit:
    MPIR_CHKLMEM_FREEALL();
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");

    return mpi_errno;
 fn_fail:
    if (newtype != MPI_DATATYPE_NULL)
        MPIR_Type_free_impl(&newtype);
    goto fn_exit;
}
Exemple #9
0
/*@
MPI_Dist_graph_create - MPI_DIST_GRAPH_CREATE returns a handle to a new
communicator to which the distributed graph topology information is
attached.

Input Parameters:
+ comm_old - input communicator (handle)
. n - number of source nodes for which this process specifies edges
  (non-negative integer)
. sources - array containing the n source nodes for which this process
  specifies edges (array of non-negative integers)
. degrees - array specifying the number of destinations for each source node
  in the source node array (array of non-negative integers)
. destinations - destination nodes for the source nodes in the source node
  array (array of non-negative integers)
. weights - weights for source to destination edges (array of non-negative
  integers or MPI_UNWEIGHTED)
. info - hints on optimization and interpretation of weights (handle)
- reorder - the process may be reordered (true) or not (false) (logical)

Output Parameters:
. comm_dist_graph - communicator with distributed graph topology added (handle)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_ARG
.N MPI_ERR_OTHER
@*/
int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[],
                          const int degrees[], const int destinations[],
                          const int weights[],
                          MPI_Info info, int reorder, MPI_Comm * comm_dist_graph)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *comm_ptr = NULL;
    MPIR_Comm *comm_dist_graph_ptr = NULL;
    MPIR_Request **reqs = NULL;
    MPIR_Topology *topo_ptr = NULL;
    MPII_Dist_graph_topology *dist_graph_ptr = NULL;
    int i;
    int j;
    int idx;
    int comm_size = 0;
    int in_capacity;
    int out_capacity;
    int **rout = NULL;
    int **rin = NULL;
    int *rin_sizes;
    int *rout_sizes;
    int *rin_idx;
    int *rout_idx;
    int *rs;
    int in_out_peers[2] = { -1, -1 };
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_CHKLMEM_DECL(9);
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm_old, mpi_errno);
            MPIR_ERRTEST_INFO_OR_NULL(info, mpi_errno);
            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#endif

    /* Convert MPI object handles to object pointers */
    MPIR_Comm_get_ptr(comm_old, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate comm_ptr */
            MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
            /* If comm_ptr is not valid, it will be reset to null */
            if (comm_ptr) {
                MPIR_ERRTEST_COMM_INTRA(comm_ptr, mpi_errno);
            }

            MPIR_ERRTEST_ARGNEG(n, "n", mpi_errno);
            if (n > 0) {
                int have_degrees = 0;
                MPIR_ERRTEST_ARGNULL(sources, "sources", mpi_errno);
                MPIR_ERRTEST_ARGNULL(degrees, "degrees", mpi_errno);
                for (i = 0; i < n; ++i) {
                    if (degrees[i]) {
                        have_degrees = 1;
                        break;
                    }
                }
                if (have_degrees) {
                    MPIR_ERRTEST_ARGNULL(destinations, "destinations", mpi_errno);
                    if (weights != MPI_UNWEIGHTED)
                        MPIR_ERRTEST_ARGNULL(weights, "weights", mpi_errno);
                }
            }

            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */


    /* ... body of routine ...  */
    /* Implementation based on Torsten Hoefler's reference implementation
     * attached to MPI-2.2 ticket #33. */
    *comm_dist_graph = MPI_COMM_NULL;

    comm_size = comm_ptr->local_size;

    /* following the spirit of the old topo interface, attributes do not
     * propagate to the new communicator (see MPI-2.1 pp. 243 line 11) */
    mpi_errno = MPII_Comm_copy(comm_ptr, comm_size, &comm_dist_graph_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(comm_dist_graph_ptr != NULL);

    /* rin is an array of size comm_size containing pointers to arrays of
     * rin_sizes[x].  rin[x] is locally known number of edges into this process
     * from rank x.
     *
     * rout is an array of comm_size containing pointers to arrays of
     * rout_sizes[x].  rout[x] is the locally known number of edges out of this
     * process to rank x. */
    MPIR_CHKLMEM_MALLOC(rout, int **, comm_size * sizeof(int *), mpi_errno, "rout", MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rin, int **, comm_size * sizeof(int *), mpi_errno, "rin", MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rin_sizes, int *, comm_size * sizeof(int), mpi_errno, "rin_sizes",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rout_sizes, int *, comm_size * sizeof(int), mpi_errno, "rout_sizes",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rin_idx, int *, comm_size * sizeof(int), mpi_errno, "rin_idx",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rout_idx, int *, comm_size * sizeof(int), mpi_errno, "rout_idx",
                        MPL_MEM_COMM);

    memset(rout, 0, comm_size * sizeof(int *));
    memset(rin, 0, comm_size * sizeof(int *));
    memset(rin_sizes, 0, comm_size * sizeof(int));
    memset(rout_sizes, 0, comm_size * sizeof(int));
    memset(rin_idx, 0, comm_size * sizeof(int));
    memset(rout_idx, 0, comm_size * sizeof(int));

    /* compute array sizes */
    idx = 0;
    for (i = 0; i < n; ++i) {
        MPIR_Assert(sources[i] < comm_size);
        for (j = 0; j < degrees[i]; ++j) {
            MPIR_Assert(destinations[idx] < comm_size);
            /* rout_sizes[i] is twice as long as the number of edges to be
             * sent to rank i by this process */
            rout_sizes[sources[i]] += 2;
            rin_sizes[destinations[idx]] += 2;
            ++idx;
        }
    }

    /* allocate arrays */
    for (i = 0; i < comm_size; ++i) {
        /* can't use CHKLMEM macros b/c we are in a loop */
        if (rin_sizes[i]) {
            rin[i] = MPL_malloc(rin_sizes[i] * sizeof(int), MPL_MEM_COMM);
        }
        if (rout_sizes[i]) {
            rout[i] = MPL_malloc(rout_sizes[i] * sizeof(int), MPL_MEM_COMM);
        }
    }

    /* populate arrays */
    idx = 0;
    for (i = 0; i < n; ++i) {
        /* TODO add this assert as proper error checking above */
        int s_rank = sources[i];
        MPIR_Assert(s_rank < comm_size);
        MPIR_Assert(s_rank >= 0);

        for (j = 0; j < degrees[i]; ++j) {
            int d_rank = destinations[idx];
            int weight = (weights == MPI_UNWEIGHTED ? 0 : weights[idx]);
            /* TODO add this assert as proper error checking above */
            MPIR_Assert(d_rank < comm_size);
            MPIR_Assert(d_rank >= 0);

            /* XXX DJG what about self-edges? do we need to drop one of these
             * cases when there is a self-edge to avoid double-counting? */

            /* rout[s][2*x] is the value of d for the j'th edge between (s,d)
             * with weight rout[s][2*x+1], where x is the current end of the
             * outgoing edge list for s.  x==(rout_idx[s]/2) */
            rout[s_rank][rout_idx[s_rank]++] = d_rank;
            rout[s_rank][rout_idx[s_rank]++] = weight;

            /* rin[d][2*x] is the value of s for the j'th edge between (s,d)
             * with weight rout[d][2*x+1], where x is the current end of the
             * incoming edge list for d.  x==(rin_idx[d]/2) */
            rin[d_rank][rin_idx[d_rank]++] = s_rank;
            rin[d_rank][rin_idx[d_rank]++] = weight;

            ++idx;
        }
    }

    for (i = 0; i < comm_size; ++i) {
        /* sanity check that all arrays are fully populated */
        MPIR_Assert(rin_idx[i] == rin_sizes[i]);
        MPIR_Assert(rout_idx[i] == rout_sizes[i]);
    }

    MPIR_CHKLMEM_MALLOC(rs, int *, 2 * comm_size * sizeof(int), mpi_errno, "red-scat source buffer",
                        MPL_MEM_COMM);
    for (i = 0; i < comm_size; ++i) {
        rs[2 * i] = (rin_sizes[i] ? 1 : 0);
        rs[2 * i + 1] = (rout_sizes[i] ? 1 : 0);
    }

    /* compute the number of peers I will recv from */
    mpi_errno =
        MPIR_Reduce_scatter_block(rs, in_out_peers, 2, MPI_INT, MPI_SUM, comm_ptr, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    MPIR_Assert(in_out_peers[0] <= comm_size && in_out_peers[0] >= 0);
    MPIR_Assert(in_out_peers[1] <= comm_size && in_out_peers[1] >= 0);

    idx = 0;
    /* must be 2*comm_size requests because we will possibly send inbound and
     * outbound edges to everyone in our communicator */
    MPIR_CHKLMEM_MALLOC(reqs, MPIR_Request **, 2 * comm_size * sizeof(MPIR_Request *), mpi_errno,
                        "temp request array", MPL_MEM_COMM);
    for (i = 0; i < comm_size; ++i) {
        if (rin_sizes[i]) {
            /* send edges where i is a destination to process i */
            mpi_errno =
                MPIC_Isend(&rin[i][0], rin_sizes[i], MPI_INT, i, MPIR_TOPO_A_TAG, comm_ptr,
                           &reqs[idx++], &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        if (rout_sizes[i]) {
            /* send edges where i is a source to process i */
            mpi_errno =
                MPIC_Isend(&rout[i][0], rout_sizes[i], MPI_INT, i, MPIR_TOPO_B_TAG, comm_ptr,
                           &reqs[idx++], &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
    }
    MPIR_Assert(idx <= (2 * comm_size));

    /* Create the topology structure */
    MPIR_CHKPMEM_MALLOC(topo_ptr, MPIR_Topology *, sizeof(MPIR_Topology), mpi_errno, "topo_ptr",
                        MPL_MEM_COMM);
    topo_ptr->kind = MPI_DIST_GRAPH;
    dist_graph_ptr = &topo_ptr->topo.dist_graph;
    dist_graph_ptr->indegree = 0;
    dist_graph_ptr->in = NULL;
    dist_graph_ptr->in_weights = NULL;
    dist_graph_ptr->outdegree = 0;
    dist_graph_ptr->out = NULL;
    dist_graph_ptr->out_weights = NULL;
    dist_graph_ptr->is_weighted = (weights != MPI_UNWEIGHTED);

    /* can't use CHKPMEM macros for this b/c we need to realloc */
    in_capacity = 10;   /* arbitrary */
    dist_graph_ptr->in = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM);
    if (dist_graph_ptr->is_weighted) {
        dist_graph_ptr->in_weights = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM);
        MPIR_Assert(dist_graph_ptr->in_weights != NULL);
    }
    out_capacity = 10;  /* arbitrary */
    dist_graph_ptr->out = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM);
    if (dist_graph_ptr->is_weighted) {
        dist_graph_ptr->out_weights = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM);
        MPIR_Assert(dist_graph_ptr->out_weights);
    }

    for (i = 0; i < in_out_peers[0]; ++i) {
        MPI_Status status;
        MPI_Aint count;
        int *buf;
        /* receive inbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_old, &status);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop */
        /* FIXME: Why not - there is only one allocated at a time. Is it only
         * that there is no defined macro to pop and free an item? */
        buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(!buf, mpi_errno, MPI_ERR_OTHER, "**nomem");

        mpi_errno =
            MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_ptr,
                      MPI_STATUS_IGNORE, &errflag);
        /* FIXME: buf is never freed on error! */
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        for (j = 0; j < count / 2; ++j) {
            int deg = dist_graph_ptr->indegree++;
            if (deg >= in_capacity) {
                in_capacity *= 2;
                /* FIXME: buf is never freed on error! */
                MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, in_capacity * sizeof(int), MPL_MEM_COMM,
                                    mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    /* FIXME: buf is never freed on error! */
                    MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, in_capacity * sizeof(int),
                                        MPL_MEM_COMM, mpi_errno);
            }
            dist_graph_ptr->in[deg] = buf[2 * j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->in_weights[deg] = buf[2 * j + 1];
        }
        MPL_free(buf);
    }

    for (i = 0; i < in_out_peers[1]; ++i) {
        MPI_Status status;
        MPI_Aint count;
        int *buf;
        /* receive outbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_old, &status);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop */
        /* Why not? */
        buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(!buf, mpi_errno, MPI_ERR_OTHER, "**nomem");

        mpi_errno =
            MPIC_Recv(buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_ptr,
                      MPI_STATUS_IGNORE, &errflag);
        /* FIXME: buf is never freed on error! */
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        for (j = 0; j < count / 2; ++j) {
            int deg = dist_graph_ptr->outdegree++;
            if (deg >= out_capacity) {
                out_capacity *= 2;
                /* FIXME: buf is never freed on error! */
                MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, out_capacity * sizeof(int), MPL_MEM_COMM,
                                    mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    /* FIXME: buf is never freed on error! */
                    MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, out_capacity * sizeof(int),
                                        MPL_MEM_COMM, mpi_errno);
            }
            dist_graph_ptr->out[deg] = buf[2 * j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->out_weights[deg] = buf[2 * j + 1];
        }
        MPL_free(buf);
    }

    mpi_errno = MPIC_Waitall(idx, reqs, MPI_STATUSES_IGNORE, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* remove any excess memory allocation */
    MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, dist_graph_ptr->indegree * sizeof(int), MPL_MEM_COMM,
                        mpi_errno);
    MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, dist_graph_ptr->outdegree * sizeof(int), MPL_MEM_COMM,
                        mpi_errno);
    if (dist_graph_ptr->is_weighted) {
        MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, dist_graph_ptr->indegree * sizeof(int),
                            MPL_MEM_COMM, mpi_errno);
        MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, dist_graph_ptr->outdegree * sizeof(int),
                            MPL_MEM_COMM, mpi_errno);
    }

    mpi_errno = MPIR_Topology_put(comm_dist_graph_ptr, topo_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_CHKPMEM_COMMIT();

    MPIR_OBJ_PUBLISH_HANDLE(*comm_dist_graph, comm_dist_graph_ptr->handle);

    /* ... end of body of routine ... */

  fn_exit:
    for (i = 0; i < comm_size; ++i) {
        MPL_free(rin[i]);
        MPL_free(rout[i]);
    }

    MPIR_CHKLMEM_FREEALL();

    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_DIST_GRAPH_CREATE);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    if (dist_graph_ptr) {
        MPL_free(dist_graph_ptr->in);
        MPL_free(dist_graph_ptr->in_weights);
        MPL_free(dist_graph_ptr->out);
        MPL_free(dist_graph_ptr->out_weights);
    }
    MPIR_CHKPMEM_REAP();
#ifdef HAVE_ERROR_CHECKING
    mpi_errno =
        MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                             "**mpi_dist_graph_create",
                             "**mpi_dist_graph_create %C %d %p %p %p %p %I %d %p", comm_old, n,
                             sources, degrees, destinations, weights, info, reorder,
                             comm_dist_graph);
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*@
   MPI_Type_create_darray - Create a datatype representing a distributed array

Input Parameters:
+ size - size of process group (positive integer)
. rank - rank in process group (nonnegative integer)
. ndims - number of array dimensions as well as process grid dimensions (positive integer)
. array_of_gsizes - number of elements of type oldtype in each dimension of global array (array of positive integers)
. array_of_distribs - distribution of array in each dimension (array of state)
. array_of_dargs - distribution argument in each dimension (array of positive integers)
. array_of_psizes - size of process grid in each dimension (array of positive integers)
. order - array storage order flag (state)
- oldtype - old datatype (handle)

Output Parameters:
. newtype - new datatype (handle)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
@*/
int MPI_Type_create_darray(int size,
                           int rank,
                           int ndims,
                           const int array_of_gsizes[],
                           const int array_of_distribs[],
                           const int array_of_dargs[],
                           const int array_of_psizes[],
                           int order, MPI_Datatype oldtype, MPI_Datatype * newtype)
{
    int mpi_errno = MPI_SUCCESS, i;
    MPI_Datatype new_handle;

    int procs, tmp_rank, tmp_size, *coords;
    MPI_Aint *st_offsets, orig_extent, disps[3];
    MPI_Datatype type_old, type_new = MPI_DATATYPE_NULL, tmp_type;

#ifdef HAVE_ERROR_CHECKING
    MPI_Aint size_with_aint;
    MPI_Offset size_with_offset;
#endif

    int *ints;
    MPIR_Datatype *datatype_ptr = NULL;
    MPIR_CHKLMEM_DECL(3);
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_TYPE_CREATE_DARRAY);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_TYPE_CREATE_DARRAY);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_DATATYPE(oldtype, "datatype", mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#endif

    /* Convert MPI object handles to object pointers */
    MPIR_Datatype_get_ptr(oldtype, datatype_ptr);
    MPIR_Datatype_get_extent_macro(oldtype, orig_extent);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Check parameters */
            MPIR_ERRTEST_ARGNONPOS(size, "size", mpi_errno, MPI_ERR_ARG);
            /* use MPI_ERR_RANK class for PE-MPI compatibility */
            MPIR_ERR_CHKANDJUMP3((rank < 0 || rank >= size), mpi_errno, MPI_ERR_RANK,
                                 "**argrange", "**argrange %s %d %d", "rank", rank, (size - 1));
            MPIR_ERRTEST_ARGNONPOS(ndims, "ndims", mpi_errno, MPI_ERR_DIMS);

            MPIR_ERRTEST_ARGNULL(array_of_gsizes, "array_of_gsizes", mpi_errno);
            MPIR_ERRTEST_ARGNULL(array_of_distribs, "array_of_distribs", mpi_errno);
            MPIR_ERRTEST_ARGNULL(array_of_dargs, "array_of_dargs", mpi_errno);
            MPIR_ERRTEST_ARGNULL(array_of_psizes, "array_of_psizes", mpi_errno);
            if (order != MPI_ORDER_C && order != MPI_ORDER_FORTRAN) {
                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                                 MPIR_ERR_RECOVERABLE,
                                                 __func__,
                                                 __LINE__,
                                                 MPI_ERR_ARG, "**arg", "**arg %s", "order");
                goto fn_fail;
            }

            tmp_size = 1;
            for (i = 0; mpi_errno == MPI_SUCCESS && i < ndims; i++) {
                MPIR_ERRTEST_ARGNONPOS(array_of_gsizes[i], "gsize", mpi_errno, MPI_ERR_ARG);
                MPIR_ERRTEST_ARGNONPOS(array_of_psizes[i], "psize", mpi_errno, MPI_ERR_ARG);

                if ((array_of_distribs[i] != MPI_DISTRIBUTE_NONE) &&
                    (array_of_distribs[i] != MPI_DISTRIBUTE_BLOCK) &&
                    (array_of_distribs[i] != MPI_DISTRIBUTE_CYCLIC)) {
                    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                                     MPIR_ERR_RECOVERABLE,
                                                     __func__,
                                                     __LINE__, MPI_ERR_ARG, "**darrayunknown", 0);
                    goto fn_fail;
                }

                if ((array_of_dargs[i] != MPI_DISTRIBUTE_DFLT_DARG) && (array_of_dargs[i] <= 0)) {
                    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                                     MPIR_ERR_RECOVERABLE,
                                                     __func__,
                                                     __LINE__,
                                                     MPI_ERR_ARG,
                                                     "**arg", "**arg %s", "array_of_dargs");
                    goto fn_fail;
                }

                if ((array_of_distribs[i] == MPI_DISTRIBUTE_NONE) && (array_of_psizes[i] != 1)) {
                    mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                                     MPIR_ERR_RECOVERABLE,
                                                     __func__,
                                                     __LINE__,
                                                     MPI_ERR_ARG,
                                                     "**darraydist",
                                                     "**darraydist %d %d", i, array_of_psizes[i]);
                    goto fn_fail;
                }

                tmp_size *= array_of_psizes[i];
            }

            MPIR_ERR_CHKANDJUMP1((tmp_size != size), mpi_errno, MPI_ERR_ARG,
                                 "**arg", "**arg %s", "array_of_psizes");

            /* TODO: GET THIS CHECK IN ALSO */

            /* check if MPI_Aint is large enough for size of global array.
             * if not, complain. */

            size_with_aint = orig_extent;
            for (i = 0; i < ndims; i++)
                size_with_aint *= array_of_gsizes[i];
            size_with_offset = orig_extent;
            for (i = 0; i < ndims; i++)
                size_with_offset *= array_of_gsizes[i];
            if (size_with_aint != size_with_offset) {
                mpi_errno = MPIR_Err_create_code(MPI_SUCCESS,
                                                 MPIR_ERR_FATAL,
                                                 __func__,
                                                 __LINE__,
                                                 MPI_ERR_ARG,
                                                 "**darrayoverflow",
                                                 "**darrayoverflow %L", size_with_offset);
                goto fn_fail;
            }

            /* Validate datatype_ptr */
            MPIR_Datatype_valid_ptr(datatype_ptr, mpi_errno);
            /* If datatype_ptr is not valid, it will be reset to null */
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno)
                goto fn_fail;
            /* --END ERROR HANDLING-- */
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

/* calculate position in Cartesian grid as MPI would (row-major
   ordering) */
    MPIR_CHKLMEM_MALLOC_ORJUMP(coords, int *, ndims * sizeof(int), mpi_errno,
                               "position is Cartesian grid", MPL_MEM_COMM);

    procs = size;
    tmp_rank = rank;
    for (i = 0; i < ndims; i++) {
        procs = procs / array_of_psizes[i];
        coords[i] = tmp_rank / procs;
        tmp_rank = tmp_rank % procs;
    }

    MPIR_CHKLMEM_MALLOC_ORJUMP(st_offsets, MPI_Aint *, ndims * sizeof(MPI_Aint), mpi_errno,
                               "st_offsets", MPL_MEM_COMM);

    type_old = oldtype;

    if (order == MPI_ORDER_FORTRAN) {
        /* dimension 0 changes fastest */
        for (i = 0; i < ndims; i++) {
            switch (array_of_distribs[i]) {
                case MPI_DISTRIBUTE_BLOCK:
                    mpi_errno = MPIR_Type_block(array_of_gsizes,
                                                i,
                                                ndims,
                                                array_of_psizes[i],
                                                coords[i],
                                                array_of_dargs[i],
                                                order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    break;
                case MPI_DISTRIBUTE_CYCLIC:
                    mpi_errno = MPIR_Type_cyclic(array_of_gsizes,
                                                 i,
                                                 ndims,
                                                 array_of_psizes[i],
                                                 coords[i],
                                                 array_of_dargs[i],
                                                 order,
                                                 orig_extent, type_old, &type_new, st_offsets + i);
                    break;
                case MPI_DISTRIBUTE_NONE:
                    /* treat it as a block distribution on 1 process */
                    mpi_errno = MPIR_Type_block(array_of_gsizes,
                                                i,
                                                ndims,
                                                1,
                                                0,
                                                MPI_DISTRIBUTE_DFLT_DARG,
                                                order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    break;
            }
            if (i) {
                MPIR_Type_free_impl(&type_old);
            }
            type_old = type_new;

            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
            /* --END ERROR HANDLING-- */
        }

        /* add displacement and UB */
        disps[1] = st_offsets[0];
        tmp_size = 1;
        for (i = 1; i < ndims; i++) {
            tmp_size *= array_of_gsizes[i - 1];
            disps[1] += (MPI_Aint) tmp_size *st_offsets[i];
        }
        /* rest done below for both Fortran and C order */
    }

    else {      /* order == MPI_ORDER_C */

        /* dimension ndims-1 changes fastest */
        for (i = ndims - 1; i >= 0; i--) {
            switch (array_of_distribs[i]) {
                case MPI_DISTRIBUTE_BLOCK:
                    mpi_errno = MPIR_Type_block(array_of_gsizes,
                                                i,
                                                ndims,
                                                array_of_psizes[i],
                                                coords[i],
                                                array_of_dargs[i],
                                                order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    break;
                case MPI_DISTRIBUTE_CYCLIC:
                    mpi_errno = MPIR_Type_cyclic(array_of_gsizes,
                                                 i,
                                                 ndims,
                                                 array_of_psizes[i],
                                                 coords[i],
                                                 array_of_dargs[i],
                                                 order,
                                                 orig_extent, type_old, &type_new, st_offsets + i);
                    break;
                case MPI_DISTRIBUTE_NONE:
                    /* treat it as a block distribution on 1 process */
                    mpi_errno = MPIR_Type_block(array_of_gsizes,
                                                i,
                                                ndims,
                                                array_of_psizes[i],
                                                coords[i],
                                                MPI_DISTRIBUTE_DFLT_DARG,
                                                order,
                                                orig_extent, type_old, &type_new, st_offsets + i);
                    break;
            }
            if (i != ndims - 1) {
                MPIR_Type_free_impl(&type_old);
            }
            type_old = type_new;

            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
            /* --END ERROR HANDLING-- */
        }

        /* add displacement and UB */
        disps[1] = st_offsets[ndims - 1];
        tmp_size = 1;
        for (i = ndims - 2; i >= 0; i--) {
            tmp_size *= array_of_gsizes[i + 1];
            disps[1] += (MPI_Aint) tmp_size *st_offsets[i];
        }
    }

    disps[1] *= orig_extent;

    disps[2] = orig_extent;
    for (i = 0; i < ndims; i++)
        disps[2] *= (MPI_Aint) (array_of_gsizes[i]);

    disps[0] = 0;

/* Instead of using MPI_LB/MPI_UB, which have been removed from MPI in MPI-3,
   use MPI_Type_create_resized. Use hindexed_block to set the starting displacement
   of the datatype (disps[1]) and type_create_resized to set lb to 0 (disps[0])
   and extent to disps[2], which makes ub = disps[2].
 */
    mpi_errno = MPIR_Type_blockindexed(1, 1, &disps[1], 1,      /* 1 means disp is in bytes */
                                       type_new, &tmp_type);

    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;
    /* --END ERROR HANDLING-- */

    mpi_errno = MPIR_Type_create_resized(tmp_type, 0, disps[2], &new_handle);

    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;
    /* --END ERROR HANDLING-- */

    MPIR_Type_free_impl(&tmp_type);
    MPIR_Type_free_impl(&type_new);

    /* at this point we have the new type, and we've cleaned up any
     * intermediate types created in the process.  we just need to save
     * all our contents/envelope information.
     */

    /* Save contents */
    MPIR_CHKLMEM_MALLOC_ORJUMP(ints, int *, (4 * ndims + 4) * sizeof(int), mpi_errno,
                               "content description", MPL_MEM_BUFFER);

    ints[0] = size;
    ints[1] = rank;
    ints[2] = ndims;

    for (i = 0; i < ndims; i++) {
        ints[i + 3] = array_of_gsizes[i];
    }
    for (i = 0; i < ndims; i++) {
        ints[i + ndims + 3] = array_of_distribs[i];
    }
    for (i = 0; i < ndims; i++) {
        ints[i + 2 * ndims + 3] = array_of_dargs[i];
    }
    for (i = 0; i < ndims; i++) {
        ints[i + 3 * ndims + 3] = array_of_psizes[i];
    }
    ints[4 * ndims + 3] = order;
    MPIR_Datatype_get_ptr(new_handle, datatype_ptr);
    mpi_errno = MPIR_Datatype_set_contents(datatype_ptr,
                                           MPI_COMBINER_DARRAY,
                                           4 * ndims + 4, 0, 1, ints, NULL, &oldtype);
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno != MPI_SUCCESS)
        goto fn_fail;
    /* --END ERROR HANDLING-- */

    MPIR_OBJ_PUBLISH_HANDLE(*newtype, new_handle);
    /* ... end of body of routine ... */

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_TYPE_CREATE_DARRAY);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno =
            MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                                 "**mpi_type_create_darray",
                                 "**mpi_type_create_darray %d %d %d %p %p %p %p %d %D %p", size,
                                 rank, ndims, array_of_gsizes, array_of_distribs, array_of_dargs,
                                 array_of_psizes, order, oldtype, newtype);
    }
#endif
    mpi_errno = MPIR_Err_return_comm(NULL, __func__, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemple #11
0
int MPIR_Exscan ( 
    const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    MPIR_Comm *comm_ptr,
    MPIR_Errflag_t *errflag )
{
    MPI_Status status;
    int        rank, comm_size;
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    int mask, dst, is_commutative, flag; 
    MPI_Aint true_extent, true_lb, extent;
    void *partial_scan, *tmp_buf;
    MPIR_Op *op_ptr;
    MPIR_CHKLMEM_DECL(2);
    
    if (count == 0) return MPI_SUCCESS;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;
    
    /* set op_errno to 0. stored in perthread structure */
    {
        MPIR_Per_thread_t *per_thread = NULL;
        int err = 0;

        MPID_THREADPRIV_KEY_GET_ADDR(MPIR_ThreadInfo.isThreaded, MPIR_Per_thread_key,
                                     MPIR_Per_thread, per_thread, &err);
        MPIR_Assert(err == 0);
        per_thread->op_errno = 0;
    }

    if (HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) {
        is_commutative = 1;
    }
    else {
        MPIR_Op_get_ptr(op, op_ptr);
        if (op_ptr->kind == MPIR_OP_KIND__USER_NONCOMMUTE)
            is_commutative = 0;
        else
            is_commutative = 1;
    }
    
    /* need to allocate temporary buffer to store partial scan*/
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    MPID_Datatype_get_extent_macro( datatype, extent );

    MPIR_CHKLMEM_MALLOC(partial_scan, void *, (count*(MPL_MAX(true_extent,extent))), mpi_errno, "partial_scan");
    /* adjust for potential negative lower bound in datatype */
    partial_scan = (void *)((char*)partial_scan - true_lb);

    /* need to allocate temporary buffer to store incoming data*/
    MPIR_CHKLMEM_MALLOC(tmp_buf, void *, (count*(MPL_MAX(true_extent,extent))), mpi_errno, "tmp_buf");
    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);

    mpi_errno = MPIR_Localcopy((sendbuf == MPI_IN_PLACE ? (const void *)recvbuf : sendbuf), count, datatype,
                               partial_scan, count, datatype);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

    flag = 0;
    mask = 0x1;
    while (mask < comm_size) {
        dst = rank ^ mask;
        if (dst < comm_size) {
            /* Send partial_scan to dst. Recv into tmp_buf */
            mpi_errno = MPIC_Sendrecv(partial_scan, count, datatype,
                                         dst, MPIR_EXSCAN_TAG, tmp_buf,
                                         count, datatype, dst,
                                         MPIR_EXSCAN_TAG, comm_ptr,
                                         &status, errflag);
            if (mpi_errno) {
                /* for communication errors, just record the error but continue */
                *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
            }

            if (rank > dst) {
		mpi_errno = MPIR_Reduce_local_impl( tmp_buf, partial_scan,
						    count, datatype, op );
                if (mpi_errno) MPIR_ERR_POP(mpi_errno);

                /* On rank 0, recvbuf is not defined.  For sendbuf==MPI_IN_PLACE
                   recvbuf must not change (per MPI-2.2).
                   On rank 1, recvbuf is to be set equal to the value
                   in sendbuf on rank 0.
                   On others, recvbuf is the scan of values in the
                   sendbufs on lower ranks. */ 
                if (rank != 0) {
                    if (flag == 0) {
                        /* simply copy data recd from rank 0 into recvbuf */
                        mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype,
                                                   recvbuf, count, datatype);
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

                        flag = 1;
                    }
                    else {
			mpi_errno = MPIR_Reduce_local_impl( tmp_buf,
					    recvbuf, count, datatype, op );
                        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                    }
                }
            }
            else {
                if (is_commutative) {
		    mpi_errno = MPIR_Reduce_local_impl( tmp_buf, partial_scan,
							count, datatype, op );
                    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
		}
                else {
		    mpi_errno = MPIR_Reduce_local_impl( partial_scan,
						tmp_buf, count, datatype, op );
                    if (mpi_errno) MPIR_ERR_POP(mpi_errno);

                    mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype,
                                               partial_scan,
                                               count, datatype);
                    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
                }
            }
        }
        mask <<= 1;
    }

    {
        MPIR_Per_thread_t *per_thread = NULL;
        int err = 0;

        MPID_THREADPRIV_KEY_GET_ADDR(MPIR_ThreadInfo.isThreaded, MPIR_Per_thread_key,
                                     MPIR_Per_thread, per_thread, &err);
        MPIR_Assert(err == 0);

        if (per_thread->op_errno)
            mpi_errno = per_thread->op_errno;
    }

fn_exit:
    MPIR_CHKLMEM_FREEALL();
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
int MPIR_Allreduce_intra_recursive_doubling(
    const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op,
    MPIR_Comm * comm_ptr,
    MPIR_Errflag_t * errflag)
{
    MPIR_CHKLMEM_DECL(1);
#ifdef MPID_HAS_HETERO
    int is_homogeneous;
    int rc;
#endif
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank, rem, newdst;
    MPI_Aint true_extent, true_lb, extent;
    void *tmp_buf;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    is_commutative = MPIR_Op_is_commutative(op);

    /* need to allocate temporary buffer to store incoming data*/
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);
    MPIR_Datatype_get_extent_macro(datatype, extent);

    MPIR_Ensure_Aint_fits_in_pointer(count * MPL_MAX(extent, true_extent));
    MPIR_CHKLMEM_MALLOC(tmp_buf, void *, count*(MPL_MAX(extent,true_extent)), mpi_errno, "temporary buffer", MPL_MEM_BUFFER);

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = MPIR_Localcopy(sendbuf, count, datatype, recvbuf,
                                   count, datatype);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }

    /* get nearest power-of-two less than or equal to comm_size */
    pof2 = comm_ptr->pof2;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2*rem) {
        if (rank % 2 == 0) { /* even */
            mpi_errno = MPIC_Send(recvbuf, count,
                                     datatype, rank+1,
                                     MPIR_ALLREDUCE_TAG, comm_ptr, errflag);
            if (mpi_errno) {
                /* for communication errors, just record the error but continue */
                *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
            }

            /* temporarily set the rank to -1 so that this
               process does not pariticipate in recursive
               doubling */
            newrank = -1;
        }
        else { /* odd */
            mpi_errno = MPIC_Recv(tmp_buf, count,
                                     datatype, rank-1,
                                     MPIR_ALLREDUCE_TAG, comm_ptr,
                                     MPI_STATUS_IGNORE, errflag);
            if (mpi_errno) {
                /* for communication errors, just record the error but continue */
                *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
                MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
                MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
            }

            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
            mpi_errno = MPIR_Reduce_local(tmp_buf, recvbuf, count, datatype, op);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);

            /* change the rank */
            newrank = rank / 2;
        }
    }
    else  /* rank >= 2*rem */
        newrank = rank - rem;

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.) */

    if (newrank != -1) {
      mask = 0x1;
      while (mask < pof2) {
          newdst = newrank ^ mask;
          /* find real rank of dest */
          dst = (newdst < rem) ? newdst*2 + 1 : newdst + rem;

          /* Send the most current data, which is in recvbuf. Recv
             into tmp_buf */
          mpi_errno = MPIC_Sendrecv(recvbuf, count, datatype,
                                       dst, MPIR_ALLREDUCE_TAG, tmp_buf,
                                       count, datatype, dst,
                                       MPIR_ALLREDUCE_TAG, comm_ptr,
                                       MPI_STATUS_IGNORE, errflag);
          if (mpi_errno) {
              /* for communication errors, just record the error but continue */
              *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
              MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
              MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
          }

          /* tmp_buf contains data received in this step.
             recvbuf contains data accumulated so far */

          if (is_commutative  || (dst < rank)) {
              /* op is commutative OR the order is already right */
              mpi_errno = MPIR_Reduce_local(tmp_buf, recvbuf, count, datatype, op);
              if (mpi_errno) MPIR_ERR_POP(mpi_errno);
          }
          else {
              /* op is noncommutative and the order is not right */
              mpi_errno = MPIR_Reduce_local(recvbuf, tmp_buf, count, datatype, op);
              if (mpi_errno) MPIR_ERR_POP(mpi_errno);

              /* copy result back into recvbuf */
              mpi_errno = MPIR_Localcopy(tmp_buf, count, datatype,
                                         recvbuf, count, datatype);
              if (mpi_errno) MPIR_ERR_POP(mpi_errno);
          }
          mask <<= 1;
      }
    }
    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the ranks who didn't participate above. */
    if (rank < 2*rem) {
        if (rank % 2)  /* odd */
            mpi_errno = MPIC_Send(recvbuf, count,
                                     datatype, rank-1,
                                     MPIR_ALLREDUCE_TAG, comm_ptr, errflag);
        else  /* even */
            mpi_errno = MPIC_Recv(recvbuf, count,
                                     datatype, rank+1,
                                     MPIR_ALLREDUCE_TAG, comm_ptr,
                                     MPI_STATUS_IGNORE, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }
fn_exit:
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
static int MPIR_Reduce_scatter_block_noncomm (
    const void *sendbuf,
    void *recvbuf,
    int recvcount,
    MPI_Datatype datatype,
    MPI_Op op,
    MPIR_Comm *comm_ptr,
    MPIR_Errflag_t *errflag )
{
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    int comm_size = comm_ptr->local_size;
    int rank = comm_ptr->rank;
    int pof2;
    int log2_comm_size;
    int i, k;
    int recv_offset, send_offset;
    int block_size, total_count, size;
    MPI_Aint true_extent, true_lb;
    int buf0_was_inout;
    void *tmp_buf0;
    void *tmp_buf1;
    void *result_ptr;
    MPIR_CHKLMEM_DECL(3);

    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    pof2 = 1;
    log2_comm_size = 0;
    while (pof2 < comm_size) {
        pof2 <<= 1;
        ++log2_comm_size;
    }

    /* begin error checking */
    MPIR_Assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */
    /* end error checking */

    /* size of a block (count of datatype per block, NOT bytes per block) */
    block_size = recvcount;
    total_count = block_size * comm_size;

    MPIR_CHKLMEM_MALLOC(tmp_buf0, void *, true_extent * total_count, mpi_errno, "tmp_buf0");
    MPIR_CHKLMEM_MALLOC(tmp_buf1, void *, true_extent * total_count, mpi_errno, "tmp_buf1");
    /* adjust for potential negative lower bound in datatype */
    tmp_buf0 = (void *)((char*)tmp_buf0 - true_lb);
    tmp_buf1 = (void *)((char*)tmp_buf1 - true_lb);

    /* Copy our send data to tmp_buf0.  We do this one block at a time and
       permute the blocks as we go according to the mirror permutation. */
    for (i = 0; i < comm_size; ++i) {
        mpi_errno = MPIR_Localcopy((char *)(sendbuf == MPI_IN_PLACE ? (const void *)recvbuf : sendbuf) + (i * true_extent * block_size),
                                   block_size, datatype,
                                   (char *)tmp_buf0 + (MPIU_Mirror_permutation(i, log2_comm_size) * true_extent * block_size), block_size, datatype);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    }
    buf0_was_inout = 1;

    send_offset = 0;
    recv_offset = 0;
    size = total_count;
    for (k = 0; k < log2_comm_size; ++k) {
        /* use a double-buffering scheme to avoid local copies */
        char *incoming_data = (buf0_was_inout ? tmp_buf1 : tmp_buf0);
        char *outgoing_data = (buf0_was_inout ? tmp_buf0 : tmp_buf1);
        int peer = rank ^ (0x1 << k);
        size /= 2;

        if (rank > peer) {
            /* we have the higher rank: send top half, recv bottom half */
            recv_offset += size;
        }
        else {
            /* we have the lower rank: recv top half, send bottom half */
            send_offset += size;
        }

        mpi_errno = MPIC_Sendrecv(outgoing_data + send_offset*true_extent,
                                     size, datatype, peer, MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                     incoming_data + recv_offset*true_extent,
                                     size, datatype, peer, MPIR_REDUCE_SCATTER_BLOCK_TAG,
                                     comm_ptr, MPI_STATUS_IGNORE, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
        /* always perform the reduction at recv_offset, the data at send_offset
           is now our peer's responsibility */
        if (rank > peer) {
            /* higher ranked value so need to call op(received_data, my_data) */
            mpi_errno = MPIR_Reduce_local_impl(
                     incoming_data + recv_offset*true_extent,
                     outgoing_data + recv_offset*true_extent,
                     size, datatype, op);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        }
        else {
            /* lower ranked value so need to call op(my_data, received_data) */
            mpi_errno = MPIR_Reduce_local_impl(
                     outgoing_data + recv_offset*true_extent,
                     incoming_data + recv_offset*true_extent,
                     size, datatype, op);
            if (mpi_errno) MPIR_ERR_POP(mpi_errno);
            buf0_was_inout = !buf0_was_inout;
        }

        /* the next round of send/recv needs to happen within the block (of size
           "size") that we just received and reduced */
        send_offset = recv_offset;
    }

    MPIR_Assert(size == recvcount);

    /* copy the reduced data to the recvbuf */
    result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
    mpi_errno = MPIR_Localcopy(result_ptr, size, datatype,
                               recvbuf, size, datatype);
    if (mpi_errno) MPIR_ERR_POP(mpi_errno);
    
fn_exit:
    MPIR_CHKLMEM_FREEALL();
    /* --BEGIN ERROR HANDLING-- */
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");
    /* --END ERROR HANDLING-- */
    return mpi_errno;
fn_fail:
    goto fn_exit;
}
Exemple #14
0
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* When allocating shared memory region segment, we need comm of processes
     * that are on the same node as this process (node_comm).
     * If node_comm == NULL, this process is the only one on this node, therefore
     * we use comm_self as node comm. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows and window objectsof
     * all processes.  allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    (*win_ptr)->shm_segment_len = 0;

    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    if ((*win_ptr)->shm_segment_len == 0) {
        (*win_ptr)->base = NULL;
    }

    else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                                (*win_ptr)->shm_segment_len,
                                                (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
                                     (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocated the interprocess mutex segment. */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                                sizeof(MPIDI_CH3I_SHM_MUTEX),
                                                (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                    &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
                                          strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                     sizeof(MPIDI_CH3I_SHM_MUTEX), (char **) &(*win_ptr)->shm_mutex,
                                     0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous
                     * process that has allocated non-zero bytes of shared
                     * memory.  We can not simply use "i-1" for the
                     * previous process because rank "i-1" might not have
                     * allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    }
                    else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                }
                else {
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via shared memory region. */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemple #15
0
static int MPIDI_CH3I_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                                      MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    MPIR_Comm *node_comm_ptr = NULL;
    int node_rank;
    int comm_rank, comm_size;
    MPI_Aint *tmp_buf = NULL;
    int i, k;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int mpi_errno = MPI_SUCCESS;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);

    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno = MPIDI_CH3U_Win_gather_info(base, size, disp_unit, info, comm_ptr, win_ptr);
        goto fn_exit;
    }

    comm_size = (*win_ptr)->comm_ptr->local_size;
    comm_rank = (*win_ptr)->comm_ptr->rank;

    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_rank = node_comm_ptr->rank;

    (*win_ptr)->info_shm_segment_len = comm_size * sizeof(MPIDI_Win_basic_info_t);

    mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->info_shm_segment_handle);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        char *serialized_hnd_ptr = NULL;

        /* create shared memory region for all processes in win and map. */
        mpi_errno = MPL_shm_seg_create_and_attach((*win_ptr)->info_shm_segment_handle,
                                                    (*win_ptr)->info_shm_segment_len,
                                                    (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* serialize handle and broadcast it to the other processes in win */
        mpi_errno =
            MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->info_shm_segment_handle,
                                                &serialized_hnd_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* wait for other processes to attach to win */
        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        /* unlink shared memory region so it gets deleted when all processes exit */
        mpi_errno = MPL_shm_seg_remove((*win_ptr)->info_shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

        /* get serialized handle from rank 0 and deserialize it */
        mpi_errno =
            MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                            &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        mpi_errno = MPL_shm_hnd_deserialize((*win_ptr)->info_shm_segment_handle, serialized_hnd,
                                              strlen(serialized_hnd));
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* attach to shared memory region created by rank 0 */
        mpi_errno =
            MPL_shm_seg_attach((*win_ptr)->info_shm_segment_handle,
                                 (*win_ptr)->info_shm_segment_len,
                                 (char **) &(*win_ptr)->info_shm_base_addr, 0);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    }

    (*win_ptr)->basic_info_table = (MPIDI_Win_basic_info_t *) ((*win_ptr)->info_shm_base_addr);

    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    tmp_buf[4 * comm_rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * comm_rank + 1] = size;
    tmp_buf[4 * comm_rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * comm_rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    if (node_rank == 0) {
        /* only node_rank == 0 writes results to basic_info_table on shared memory region. */
        k = 0;
        for (i = 0; i < comm_size; i++) {
            (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
            (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
            (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
        }
    }

    /* Make sure that all local processes see the results written by node_rank == 0 */
    mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*@
   MPI_Type_create_hindexed - Create a datatype for an indexed datatype with
   displacements in bytes

Input Parameters:
+ count - number of blocks --- also number of entries in
  array_of_displacements and array_of_blocklengths (integer)
. array_of_blocklengths - number of elements in each block (array of nonnegative integers)
. array_of_displacements - byte displacement of each block (array of address integers)
- oldtype - old datatype (handle)

Output Parameters:
. newtype - new datatype (handle)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
@*/
int MPI_Type_create_hindexed(int count,
			     const int array_of_blocklengths[],
			     const MPI_Aint array_of_displacements[],
			     MPI_Datatype oldtype,
			     MPI_Datatype *newtype)
{
    static const char FCNAME[] = "MPI_Type_create_hindexed";
    int mpi_errno = MPI_SUCCESS;
    MPI_Datatype new_handle;
    MPIR_Datatype *new_dtp;
    int i, *ints;
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_TYPE_CREATE_HINDEXED);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_TYPE_CREATE_HINDEXED);

#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    int j;
	    MPIR_Datatype *datatype_ptr = NULL;

	    MPIR_ERRTEST_COUNT(count, mpi_errno);
	    if (count > 0) {
		MPIR_ERRTEST_ARGNULL(array_of_blocklengths, "array_of_blocklengths", mpi_errno);
		MPIR_ERRTEST_ARGNULL(array_of_displacements, "array_of_displacements", mpi_errno);
	    }

	    MPIR_ERRTEST_DATATYPE(oldtype, "datatype", mpi_errno);

	    if (HANDLE_GET_KIND(oldtype) != HANDLE_KIND_BUILTIN) {
		MPIR_Datatype_get_ptr(oldtype, datatype_ptr);
		MPIR_Datatype_valid_ptr(datatype_ptr, mpi_errno);
                if (mpi_errno != MPI_SUCCESS) goto fn_fail;
	    }
	    for (j=0; j < count; j++) {
		MPIR_ERRTEST_ARGNEG(array_of_blocklengths[j], "blocklength", mpi_errno);
	    }
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    mpi_errno = MPIR_Type_indexed(count,
				  array_of_blocklengths,
				  array_of_displacements,
				  1, /* displacements in bytes */
				  oldtype,
				  &new_handle);

    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    MPIR_CHKLMEM_MALLOC_ORJUMP(ints, int *, (count + 1) * sizeof(int), mpi_errno, "content description", MPL_MEM_BUFFER);

    ints[0] = count;

    for (i=0; i < count; i++)
    {
	ints[i+1] = array_of_blocklengths[i];
    }
    MPIR_Datatype_get_ptr(new_handle, new_dtp);
    mpi_errno = MPIR_Datatype_set_contents(new_dtp,
				           MPI_COMBINER_HINDEXED,
				           count+1, /* ints (count, blocklengths) */
				           count, /* aints (displacements) */
				           1, /* types */
				           ints,
				           array_of_displacements,
				           &oldtype);

    if (mpi_errno != MPI_SUCCESS) goto fn_fail;

    MPIR_OBJ_PUBLISH_HANDLE(*newtype, new_handle);
    /* ... end of body of routine ... */

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_TYPE_CREATE_HINDEXED);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
	mpi_errno = MPIR_Err_create_code(
	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_type_create_hindexed",
	    "**mpi_type_create_hindexed %d %p %p %D %p", count, array_of_blocklengths, array_of_displacements, oldtype, newtype);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(NULL, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
Exemple #17
0
static int MPIDI_CH3I_SHM_Wins_match(MPIR_Win ** win_ptr, MPIR_Win ** matched_win,
                                     MPI_Aint ** base_shm_offs_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int i, comm_size;
    int node_size, node_rank, shm_node_size;
    int group_diff;
    int base_diff;

    MPIR_Comm *node_comm_ptr = NULL, *shm_node_comm_ptr = NULL;
    int *node_ranks = NULL, *node_ranks_in_shm_node = NULL;
    MPIR_Group *node_group_ptr = NULL, *shm_node_group_ptr = NULL;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPI_Aint *base_shm_offs;

    MPIDI_SHM_Win_t *elem = shm_wins_list;

    MPIR_CHKLMEM_DECL(2);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);

    *matched_win = NULL;
    base_shm_offs = *base_shm_offs_ptr;
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    comm_size = (*win_ptr)->comm_ptr->local_size;

    MPIR_CHKLMEM_MALLOC(node_ranks, int *, node_size * sizeof(int), mpi_errno, "node_ranks");
    MPIR_CHKLMEM_MALLOC(node_ranks_in_shm_node, int *, node_size * sizeof(int),
                        mpi_errno, "node_ranks_in_shm_comm");

    for (i = 0; i < node_size; i++) {
        node_ranks[i] = i;
    }

    mpi_errno = MPIR_Comm_group_impl(node_comm_ptr, &node_group_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    while (elem != NULL) {
        MPIR_Win *shm_win = elem->win;
        if (!shm_win)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Compare node_comm.
         *
         * Only support shm if new node_comm is equal to or a subset of shm node_comm.
         * Shm node_comm == a subset of node_comm is not supported, because it means
         * some processes of node_comm cannot be shared, but RMA operation simply checks
         * the node_id of a target process for distinguishing shm target.  */
        shm_node_comm_ptr = shm_win->comm_ptr->node_comm;
        shm_node_size = shm_node_comm_ptr->local_size;

        if (node_size > shm_node_size)
            MPIDI_SHM_Wins_next_and_continue(elem);

        mpi_errno = MPIR_Comm_group_impl(shm_win->comm_ptr, &shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_translate_ranks_impl(node_group_ptr, node_size,
                                                    node_ranks, shm_node_group_ptr,
                                                    node_ranks_in_shm_node);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        shm_node_group_ptr = NULL;

        group_diff = 0;
        for (i = 0; i < node_size; i++) {
            /* not exist in shm_comm->node_comm */
            if (node_ranks_in_shm_node[i] == MPI_UNDEFINED) {
                group_diff = 1;
                break;
            }
        }
        if (group_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Gather the offset of base_addr from all local processes. Match only
         * when all of them are included in the shm segment in current shm_win.
         *
         * Note that this collective call must be called after checking the
         * group match in order to guarantee all the local processes can perform
         * this call. */
        base_shm_offs[node_rank] = (MPI_Aint) ((*win_ptr)->base)
            - (MPI_Aint) (shm_win->shm_base_addr);
        mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                        base_shm_offs, 1, MPI_AINT, node_comm_ptr, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

        base_diff = 0;
        for (i = 0; i < comm_size; ++i) {
            int i_node_rank = (*win_ptr)->comm_ptr->intranode_table[i];
            if (i_node_rank >= 0) {
                MPIR_Assert(i_node_rank < node_size);

                if (base_shm_offs[i_node_rank] < 0 ||
                    base_shm_offs[i_node_rank] + (*win_ptr)->basic_info_table[i].size >
                    shm_win->shm_segment_len) {
                    base_diff = 1;
                    break;
                }
            }
        }

        if (base_diff)
            MPIDI_SHM_Wins_next_and_continue(elem);

        /* Found the first matched shm_win */
        *matched_win = shm_win;
        break;
    }

  fn_exit:
    if (node_group_ptr != NULL)
        mpi_errno = MPIR_Group_free_impl(node_group_ptr);
    /* Only free it here when group_translate_ranks fails. */
    if (shm_node_group_ptr != NULL)
        mpi_errno = MPIR_Group_free_impl(shm_node_group_ptr);

    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_SHM_WINS_MATCH);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
int MPIR_Allgather_intra_brucks (
    const void *sendbuf,
    int sendcount,
    MPI_Datatype sendtype,
    void *recvbuf,
    int recvcount,
    MPI_Datatype recvtype,
    MPIR_Comm *comm_ptr,
    MPIR_Errflag_t *errflag )
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mpi_errno_ret = MPI_SUCCESS;
    MPI_Aint   recvtype_extent;
    MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb;
    int pof2, src, rem;
    void *tmp_buf = NULL;
    int curr_cnt, dst;

    MPIR_CHKLMEM_DECL(1);

    if (((sendcount == 0) && (sendbuf != MPI_IN_PLACE)) || (recvcount == 0))
        return MPI_SUCCESS;

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    MPIR_Datatype_get_extent_macro( recvtype, recvtype_extent );

    /* This is the largest offset we add to recvbuf */
    MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
				     (comm_size * recvcount * recvtype_extent));

    /* allocate a temporary buffer of the same size as recvbuf. */

    /* get true extent of recvtype */
    MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent);
            
    recvbuf_extent = recvcount * comm_size * (MPL_MAX(recvtype_true_extent, recvtype_extent));

    MPIR_CHKLMEM_MALLOC(tmp_buf, void*, recvbuf_extent, mpi_errno, "tmp_buf", MPL_MEM_BUFFER);
            
    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb);

    /* copy local data to the top of tmp_buf */ 
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno = MPIR_Localcopy (sendbuf, sendcount, sendtype,
                                    tmp_buf, recvcount, recvtype);
        if (mpi_errno) { 
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        mpi_errno = MPIR_Localcopy (((char *)recvbuf +
                                     rank * recvcount * recvtype_extent), 
                                     recvcount, recvtype, tmp_buf, 
                                     recvcount, recvtype);
	    if (mpi_errno) { 
            MPIR_ERR_POP(mpi_errno);
	    }
    }
        
    /* do the first \floor(\lg p) steps */

    curr_cnt = recvcount;
    pof2 = 1;
    while (pof2 <= comm_size/2) {
        src = (rank + pof2) % comm_size;
        dst = (rank - pof2 + comm_size) % comm_size;
            
        mpi_errno = MPIC_Sendrecv(tmp_buf, curr_cnt, recvtype, dst,
                                     MPIR_ALLGATHER_TAG,
                                     ((char *)tmp_buf + curr_cnt*recvtype_extent),
                                     curr_cnt, recvtype,
                                     src, MPIR_ALLGATHER_TAG, comm_ptr,
                                     MPI_STATUS_IGNORE, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
        curr_cnt *= 2;
        pof2 *= 2;
    }

    /* if comm_size is not a power of two, one more step is needed */

    rem = comm_size - pof2;
    if (rem) {
        src = (rank + pof2) % comm_size;
        dst = (rank - pof2 + comm_size) % comm_size;
        
        mpi_errno = MPIC_Sendrecv(tmp_buf, rem * recvcount, recvtype,
                                     dst, MPIR_ALLGATHER_TAG,
                                     ((char *)tmp_buf + curr_cnt*recvtype_extent),
                                     rem * recvcount, recvtype,
                                     src, MPIR_ALLGATHER_TAG, comm_ptr,
                                     MPI_STATUS_IGNORE, errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag = MPIR_ERR_GET_CLASS(mpi_errno);
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    /* Rotate blocks in tmp_buf down by (rank) blocks and store
     * result in recvbuf. */
        
    mpi_errno = MPIR_Localcopy(tmp_buf, (comm_size-rank)*recvcount,
                    recvtype, (char *) recvbuf + rank*recvcount*recvtype_extent, 
                                   (comm_size-rank)*recvcount, recvtype);
	if (mpi_errno) { 
	    MPIR_ERR_POP(mpi_errno);
	}

    if (rank) {
        mpi_errno = MPIR_Localcopy((char *) tmp_buf + 
                               (comm_size-rank)*recvcount*recvtype_extent, 
                                   rank*recvcount, recvtype, recvbuf,
                                   rank*recvcount, recvtype);
        if (mpi_errno) { 
            MPIR_ERR_POP(mpi_errno);
	    }
    }

 fn_exit:
    MPIR_CHKLMEM_FREEALL();
    if (mpi_errno_ret)
        mpi_errno = mpi_errno_ret;
    else if (*errflag != MPIR_ERR_NONE)
        MPIR_ERR_SET(mpi_errno, *errflag, "**coll_fail");

    return mpi_errno;

 fn_fail:
    goto fn_exit;
}
Exemple #19
0
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit,
                               MPIR_Info * info, MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    comm_size = (*win_ptr)->comm_ptr->local_size;
    rank = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        comm_size * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* get the addresses of the windows, window objects, and completion
     * counters of all processes.  allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * comm_size * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */
    tmp_buf[4 * rank] = MPIR_Ptr_to_aint(base);
    tmp_buf[4 * rank + 1] = size;
    tmp_buf[4 * rank + 2] = (MPI_Aint) disp_unit;
    tmp_buf[4 * rank + 3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    k = 0;
    for (i = 0; i < comm_size; i++) {
        (*win_ptr)->basic_info_table[i].base_addr = MPIR_Aint_to_ptr(tmp_buf[k++]);
        (*win_ptr)->basic_info_table[i].size = tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].disp_unit = (int) tmp_buf[k++];
        (*win_ptr)->basic_info_table[i].win_handle = (MPI_Win) tmp_buf[k++];
    }

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}