Example #1
0
int MPID_Abort(MPIR_Comm * comm, int mpi_errno, int exit_code, const char *error_msg)
{
    char sys_str[MPI_MAX_ERROR_STRING + 5] = "";
    char comm_str[MPI_MAX_ERROR_STRING] = "";
    char world_str[MPI_MAX_ERROR_STRING] = "";
    char error_str[2 * MPI_MAX_ERROR_STRING + 128];
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_ABORT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_ABORT);

    if (MPIR_Process.comm_world) {
        int rank = MPIR_Process.comm_world->rank;
        snprintf(world_str, sizeof(world_str), " on node %d", rank);
    }

    if (comm) {
        int rank = comm->rank;
        int context_id = comm->context_id;
        snprintf(comm_str, sizeof(comm_str), " (rank %d in comm %d)", rank, context_id);
    }

    if (!error_msg)
        error_msg = "Internal error";

    if (mpi_errno != MPI_SUCCESS) {
        char msg[MPI_MAX_ERROR_STRING] = "";
        MPIR_Err_get_string(mpi_errno, msg, MPI_MAX_ERROR_STRING, NULL);
        snprintf(sys_str, sizeof(sys_str), " (%s)", msg);
    }
    MPL_snprintf(error_str, sizeof(error_str), "Abort(%d)%s%s: %s%s\n",
                 exit_code, world_str, comm_str, error_msg, sys_str);
    MPL_error_printf("%s", error_str);

    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_ABORT);
    fflush(stderr);
    fflush(stdout);
    if (NULL == comm || (MPIR_Comm_size(comm) == 1 && comm->comm_kind == MPIR_COMM_KIND__INTRACOMM))
        MPL_exit(exit_code);

    if (comm != MPIR_Process.comm_world) {
        MPIDIG_comm_abort(comm, exit_code);
    } else {
#ifdef USE_PMIX_API
        PMIx_Abort(exit_code, error_msg, NULL, 0);
#elif defined(USE_PMI2_API)
        PMI2_Abort(TRUE, error_msg);
#else
        PMI_Abort(exit_code, error_msg);
#endif
    }
    return 0;
}
Example #2
0
int MPIDU_bc_allgather(MPIR_Comm * comm, int *nodemap, void *bc, int bc_len, int same_len,
                       void **bc_table, size_t ** bc_indices)
{
    int mpi_errno = MPI_SUCCESS;
    int local_rank = -1, local_leader = -1;
    int rank = MPIR_Comm_rank(comm), size = MPIR_Comm_size(comm);
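    /* Note: segment, indices, barrier, and local_size are presumably
     * file-scope state defined elsewhere in the original source file. */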

    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if (!same_len) {
        bc_len *= 2;
        *bc_indices = indices;
    }

    MPIR_NODEMAP_get_local_info(rank, size, nodemap, &local_size, &local_rank, &local_leader);
    if (rank != local_leader) {
        size_t start = local_leader - nodemap[comm->rank] + (local_rank - 1);

        memcpy(&segment[start * bc_len], bc, bc_len);
    }

    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if (rank == local_leader) {
        MPIR_Errflag_t errflag = MPIR_ERR_NONE;
        MPIR_Comm *allgather_comm = comm->node_roots_comm ? comm->node_roots_comm : comm;

        MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, segment, (local_size - 1) * bc_len,
                            MPI_BYTE, allgather_comm, &errflag);
    }

    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    *bc_table = segment;

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Example #3
0
/*@

MPI_Graph_map - Maps process to graph topology information

Input Parameters:
+ comm - input communicator (handle) 
. nnodes - number of graph nodes (integer) 
. index - integer array specifying the graph structure, see 'MPI_GRAPH_CREATE' 
- edges - integer array specifying the graph structure 

Output Parameter:
. newrank - reordered rank of the calling process; 'MPI_UNDEFINED' if the 
calling process does not belong to graph (integer) 
 
.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TOPOLOGY
.N MPI_ERR_COMM
.N MPI_ERR_ARG
@*/
EXPORT_MPI_API int MPI_Graph_map ( MPI_Comm comm_old, int nnodes, int *index, int *edges, 
		    int *newrank )
{
  int rank, size;
  int mpi_errno = MPI_SUCCESS;
  struct MPIR_COMMUNICATOR *comm_old_ptr;
  static char myname[] = "MPI_GRAPH_MAP";

  TR_PUSH(myname);
  comm_old_ptr = MPIR_GET_COMM_PTR(comm_old);

#ifndef MPIR_NO_ERROR_CHECKING
  MPIR_TEST_MPI_COMM(comm_old,comm_old_ptr,comm_old_ptr,myname);
  if (nnodes < 1) mpi_errno = MPI_ERR_ARG;
  MPIR_TEST_ARG(newrank);
  MPIR_TEST_ARG(index);
  MPIR_TEST_ARG(edges);
  if (mpi_errno)
      return MPIR_ERROR(comm_old_ptr, mpi_errno, myname );
#endif
  
  /* Test that the communicator is large enough */
  MPIR_Comm_size( comm_old_ptr, &size );
  if (size < nnodes) {
      return MPIR_ERROR( comm_old_ptr, MPI_ERR_ARG, myname );
  }

  /* Am I in this topology? */
  MPIR_Comm_rank ( comm_old_ptr, &rank );
  if ( rank < nnodes )
    (*newrank) = rank;
  else
    (*newrank) = MPI_UNDEFINED;

  TR_POP;
  return (mpi_errno);
}
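A minimal usage sketch of MPI_Graph_map (not taken from the source above): each process asks where it would land in a made-up 4-node ring graph. It assumes an MPI installation; build with an mpicc-style wrapper and run with at least 4 processes.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    /* Ring of 4 nodes: node i is connected to (i-1)%4 and (i+1)%4.
     * index[] holds cumulative edge counts, edges[] the flattened adjacency. */
    int index[4] = { 2, 4, 6, 8 };
    int edges[8] = { 1, 3, 0, 2, 1, 3, 0, 2 };
    int newrank;

    MPI_Init(&argc, &argv);
    MPI_Graph_map(MPI_COMM_WORLD, 4, index, edges, &newrank);
    if (newrank == MPI_UNDEFINED)
        printf("this process is not part of the graph\n");
    else
        printf("this process would be rank %d in the graph\n", newrank);
    MPI_Finalize();
    return 0;
}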
Example #4
0
static int intra_Scatter(void *sendbuf,
                         int sendcnt,
                         struct MPIR_DATATYPE *sendtype,
                         void *recvbuf,
                         int recvcnt,
                         struct MPIR_DATATYPE *recvtype,
                         int root, struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    MPI_Aint extent;
    int rank, size, i;
    int mpi_errno = MPI_SUCCESS;
    static char myname[] = "MPI_SCATTER";

    /* Get size and rank */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);

    /* Check for invalid arguments */
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* Switch communicators to the hidden collective */
    comm = comm->comm_coll;

    /* Get the size of the send type */
    MPI_Type_extent(sendtype->self, &extent);

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* If I'm the root, send messages to the rest of 'em */
    if (rank == root) {
        for (i = 0; i < root; i++) {
            mpi_errno =
                MPI_Send((void *) ((char *) sendbuf +
                                   i * sendcnt * extent), sendcnt,
                         sendtype->self, i, MPIR_SCATTER_TAG, comm->self);
            if (mpi_errno)
                return mpi_errno;
        }

        mpi_errno =
            MPI_Sendrecv((void *) ((char *) sendbuf +
                                   rank * sendcnt * extent), sendcnt,
                         sendtype->self, rank, MPIR_SCATTER_TAG, recvbuf,
                         recvcnt, recvtype->self, rank, MPIR_SCATTER_TAG,
                         comm->self, &status);
        if (mpi_errno)
            return mpi_errno;

        for (i = root + 1; i < size; i++) {
            mpi_errno =
                MPI_Send((void *) ((char *) sendbuf +
                                   i * sendcnt * extent), sendcnt,
                         sendtype->self, i, MPIR_SCATTER_TAG, comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
    } else
        mpi_errno = MPI_Recv(recvbuf, recvcnt, recvtype->self, root,
                             MPIR_SCATTER_TAG, comm->self, &status);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
Example #5
0
static int intra_Gatherv(void *sendbuf,
                         int sendcnt,
                         struct MPIR_DATATYPE *sendtype,
                         void *recvbuf,
                         int *recvcnts,
                         int *displs,
                         struct MPIR_DATATYPE *recvtype,
                         int root, struct MPIR_COMMUNICATOR *comm)
{
    int size, rank;
    int mpi_errno = MPI_SUCCESS;
    static char myname[] = "MPI_GATHERV";

    /* Is root within the communicator? */
    MPIR_Comm_size(comm, &size);
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* Get my rank and switch communicators to the hidden collective */
    MPIR_Comm_rank(comm, &rank);
    comm = comm->comm_coll;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* If rank == root, then I recv lots, otherwise I send */
    if (rank == root) {
        MPI_Aint extent;
        int i;
        MPI_Request req;
        MPI_Status status;

        mpi_errno = MPI_Isend(sendbuf, sendcnt, sendtype->self, root,
                              MPIR_GATHERV_TAG, comm->self, &req);
        if (mpi_errno)
            return mpi_errno;
        MPI_Type_extent(recvtype->self, &extent);
        for (i = 0; i < size; i++) {
            mpi_errno =
                MPI_Recv((void *) ((char *) recvbuf + displs[i] * extent),
                         recvcnts[i], recvtype->self, i, MPIR_GATHERV_TAG,
                         comm->self, &status);
            if (mpi_errno)
                return mpi_errno;
        }
        mpi_errno = MPI_Wait(&req, &status);
    } else
        mpi_errno = MPI_Send(sendbuf, sendcnt, sendtype->self, root,
                             MPIR_GATHERV_TAG, comm->self);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
Example #6
0
static int intra_Gather(void *sendbuf,
                        int sendcnt,
                        struct MPIR_DATATYPE *sendtype,
                        void *recvbuf,
                        int recvcount,
                        struct MPIR_DATATYPE *recvtype,
                        int root, struct MPIR_COMMUNICATOR *comm)
{
    int size, rank;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint extent;            /* Datatype extent */
    static char myname[] = "MPI_GATHER";

    /* Is root within the communicator? */
    MPIR_Comm_size(comm, &size);
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* Get my rank and switch communicators to the hidden collective */
    MPIR_Comm_rank(comm, &rank);
    comm = comm->comm_coll;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* If rank == root, then I recv lots, otherwise I send */
    /* This should use the same mechanism used in reduce; the intermediate nodes
       will need to allocate space. 
     */
    if (rank == root) {
        int i;
        MPI_Request req;
        MPI_Status status;

        /* This should really be COPYSELF...., with the for loop skipping
           root. */
        mpi_errno = MPI_Isend(sendbuf, sendcnt, sendtype->self, root,
                              MPIR_GATHER_TAG, comm->self, &req);
        if (mpi_errno)
            return mpi_errno;
        MPI_Type_extent(recvtype->self, &extent);
        for (i = 0; i < size; i++) {
            mpi_errno =
                MPI_Recv((void *) (((char *) recvbuf) +
                                   i * extent * recvcount), recvcount,
                         recvtype->self, i, MPIR_GATHER_TAG, comm->self,
                         &status);
            if (mpi_errno)
                return mpi_errno;
        }
        mpi_errno = MPI_Wait(&req, &status);
    } else
        mpi_errno = MPI_Send(sendbuf, sendcnt, sendtype->self, root,
                             MPIR_GATHER_TAG, comm->self);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
Example #7
0
static int intra_Bcast(void *buffer,
                       int count,
                       struct MPIR_DATATYPE *datatype,
                       int root, struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size, src, dst;
    int relative_rank, mask;
    int mpi_errno = MPI_SUCCESS;
    static char myname[] = "MPI_BCAST";

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* Is root within the comm and more than 1 processes involved? */
    MPIR_Comm_size(comm, &size);
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* If there is only one process */
    if (size == 1)
        return (mpi_errno);

    /* Get my rank and switch communicators to the hidden collective */
    MPIR_Comm_rank(comm, &rank);
    comm = comm->comm_coll;

    /* Algorithm:
       This uses a fairly basic recursive subdivision algorithm.
       The root sends to the process size/2 away; the receiver becomes
       a root for a subtree and applies the same process. 

       So that the new root can easily identify the size of its
       subtree, the (subtree) roots are all powers of two (relative to the root)
       If m = the first power of 2 such that 2^m >= the size of the
       communicator, then the subtree at root at 2^(m-k) has size 2^k
       (with special handling for subtrees that aren't a power of two in size).

       Optimizations:

       The original code attempted to switch to a linear broadcast when
       the subtree size became too small.  As a further variation, the subtree
       broadcast sent data to the center of the block, rather than to one end.
       However, the original code did not properly compute the communications,
       resulting in extraneous (though harmless) communication.    

       For very small messages, using a linear algorithm (process 0 sends to
       process 1, who sends to 2, etc.) can be better, since no one process
       takes more than 1 send/recv time, and successive bcasts using the same
       root can overlap.  

       Another important technique for long messages is pipelining---sending
       the messages in blocks so that the message can be pipelined through
       the network without waiting for the subtree roots to receive the entire
       message before forwarding it to other processors.  This is hard to
       do if the datatype/count are not the same on each processor (note that
       this is allowed - only the signatures must match).  Of course, this can
       be accomplished at the byte transfer level, but it is awkward 
       from the MPI point-to-point routines.

       Nonblocking operations can be used to achieve some "horizontal"
       pipelining (on some systems) by allowing multiple send/receives
       to begin on the same processor.
     */

    relative_rank = (rank >= root) ? rank - root : rank - root + size;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* Do subdivision.  There are two phases:
       1. Wait for arrival of data.  Because of the power of two nature
       of the subtree roots, the source of this message is always the
       process whose relative rank has the least significant bit CLEARED.
       That is, process 4 (100) receives from process 0, process 7 (111) 
       from process 6 (110), etc.   
       2. Forward to my subtree

       Note that the process that is the tree root is handled automatically
       by this code, since it has no bits set.

     */
    mask = 0x1;
    while (mask < size) {
        if (relative_rank & mask) {
            src = rank - mask;
            if (src < 0)
                src += size;
            mpi_errno = MPI_Recv(buffer, count, datatype->self, src,
                                 MPIR_BCAST_TAG, comm->self, &status);
            if (mpi_errno)
                return mpi_errno;
            break;
        }
        mask <<= 1;
    }

    /* This process is responsible for all processes that have bits set from
       the LSB up to (but not including) mask.  Because of the "not including",
       we start by shifting mask back down one.

       We can easily change to a different algorithm at any power of two
       by changing the test (mask > 1) to (mask > block_size) 

       One such version would use non-blocking operations for the last 2-4
       steps (this also bounds the number of MPI_Requests that would
       be needed).
     */
    mask >>= 1;
    while (mask > 0) {
        if (relative_rank + mask < size) {
            dst = rank + mask;
            if (dst >= size)
                dst -= size;
            mpi_errno = MPI_Send(buffer, count, datatype->self, dst,
                                 MPIR_BCAST_TAG, comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
        mask >>= 1;
    }

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
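The recursive-subdivision schedule described in the comment blocks above can be printed with a small standalone sketch (not part of the source; the communicator size and root below are made-up inputs). For size = 8 and root = 2, for example, it shows rank 6 receiving directly from the root and then forwarding to ranks 0 and 7.

#include <stdio.h>

/* Print, for every rank, who it receives from and who it forwards to,
 * mirroring the two phases of the loop in intra_Bcast. */
static void print_bcast_schedule(int size, int root)
{
    for (int rank = 0; rank < size; rank++) {
        int relative_rank = (rank >= root) ? rank - root : rank - root + size;
        int mask = 0x1;

        /* Phase 1: wait for the data; the source has the lowest set bit cleared. */
        while (mask < size) {
            if (relative_rank & mask) {
                printf("rank %d receives from rank %d\n",
                       rank, (rank - mask + size) % size);
                break;
            }
            mask <<= 1;
        }

        /* Phase 2: forward to my subtree. */
        mask >>= 1;
        while (mask > 0) {
            if (relative_rank + mask < size)
                printf("rank %d sends to rank %d\n", rank, (rank + mask) % size);
            mask >>= 1;
        }
    }
}

int main(void)
{
    print_bcast_schedule(8, 2);
    return 0;
}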
Example #8
0
static int intra_Reduce(void *sendbuf,
                        void *recvbuf,
                        int count,
                        struct MPIR_DATATYPE *datatype,
                        MPI_Op op,
                        int root, struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int size, rank;
    int mask, relrank, source, lroot;
    int mpi_errno = MPI_SUCCESS;
    MPI_User_function *uop;
    MPI_Aint lb, ub, m_extent;  /* Extent in memory */
    void *buffer;
    struct MPIR_OP *op_ptr;
    static char myname[] = "MPI_REDUCE";
    MPIR_ERROR_DECL;
    mpi_comm_err_ret = 0;

    /* Is root within the communicator? */
    MPIR_Comm_size(comm, &size);
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* If the operation is predefined, we could check that the datatype's
       type signature is compatible with the operation.  
     */
#ifdef MPID_Reduce
    /* Eventually, this could apply the MPID_Reduce routine in a loop for
       counts > 1 */
    if (comm->ADIReduce && count == 1) {
        /* Call a routine to sort through the datatypes and operations ...
           This allows us to provide partial support (e.g., only SUM_DOUBLE)
         */
        if (MPIR_ADIReduce(comm->ADIctx, comm, sendbuf, recvbuf, count,
                           datatype->self, op, root) == MPI_SUCCESS)
            return MPI_SUCCESS;
    }
#endif

    /* Get my rank and switch communicators to the hidden collective */
    MPIR_Comm_rank(comm, &rank);
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, myname);
    uop = op_ptr->op;


    /* Here's the algorithm.  Relative to the root, look at the bit pattern in 
       my rank.  Starting from the right (lsb), if the bit is 1, send to 
       the node with that bit zero and exit; if the bit is 0, receive from the
       node with that bit set and combine (as long as that node is within the
       group)

       Note that by receiving with source selection, we guarantee that we get
       the same bits with the same input.  If we allowed the parent to receive 
       the children in any order, then timing differences could cause different
       results (roundoff error, over/underflows in some cases, etc).

       Because of the way these are ordered, if root is 0, then this is correct
       for both commutative and non-commutative operations.  If root is not
       0, then for non-commutative, we use a root of zero and then send
       the result to the root.  To see this, note that the ordering is
       mask = 1: (ab)(cd)(ef)(gh)            (odds send to evens)
       mask = 2: ((ab)(cd))((ef)(gh))        (3,6 send to 0,4)
       mask = 4: (((ab)(cd))((ef)(gh)))      (4 sends to 0)

       Comments on buffering.  
       If the datatype is not contiguous, we still need to pass contiguous 
       data to the user routine.  
       In this case, we should make a copy of the data in some format, 
       and send/operate on that.

       In general, we can't use MPI_PACK, because the alignment of that
       is rather vague, and the data may not be re-usable.  What we actually
       need is a "squeeze" operation that removes the skips.
     */
    /* Make a temporary buffer */
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    /* MPI_Type_extent ( datatype, &extent ); */
    MPIR_ALLOC(buffer, (void *) MALLOC(m_extent * count), comm,
               MPI_ERR_EXHAUSTED, "MPI_REDUCE");
    buffer = (void *) ((char *) buffer - lb);

    /* If I'm not the root, then my recvbuf may not be valid, therefore
       I have to allocate a temporary one */
    if (rank != root) {
        MPIR_ALLOC(recvbuf, (void *) MALLOC(m_extent * count),
                   comm, MPI_ERR_EXHAUSTED, "MPI_REDUCE");
        recvbuf = (void *) ((char *) recvbuf - lb);
    }

    /* This code isn't correct if the source is a more complex datatype */
    memcpy(recvbuf, sendbuf, m_extent * count);
    mask = 0x1;
    if (op_ptr->commute)
        lroot = root;
    else
        lroot = 0;
    relrank = (rank - lroot + size) % size;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    MPIR_Op_errno = MPI_SUCCESS;
    while ( /*(mask & relrank) == 0 && */ mask < size) {
        /* Receive */
        if ((mask & relrank) == 0) {
            source = (relrank | mask);
            if (source < size) {
                source = (source + lroot) % size;
                mpi_errno = MPI_Recv(buffer, count, datatype->self, source,
                                     MPIR_REDUCE_TAG, comm->self, &status);
                if (mpi_errno)
                    return MPIR_ERROR(comm, mpi_errno, myname);
                /* The sender is above us, so the received buffer must be
                   the second argument (in the noncommutative case). */
                /* error pop/push allows errors found by predefined routines
                   to be visible.  We need a better way to do this */
                /* MPIR_ERROR_POP(comm); */
                if (op_ptr->commute)
                    (*uop) (buffer, recvbuf, &count, &datatype->self);
                else {
                    (*uop) (recvbuf, buffer, &count, &datatype->self);
                    /* short term hack to keep recvbuf up-to-date */
                    memcpy(recvbuf, buffer, m_extent * count);
                }
                /* MPIR_ERROR_PUSH(comm); */
            }
        } else {
            /* I've received all that I'm going to.  Send my result to 
               my parent */
            source = ((relrank & (~mask)) + lroot) % size;
            mpi_errno = MPI_Send(recvbuf, count, datatype->self,
                                 source, MPIR_REDUCE_TAG, comm->self);
            if (mpi_errno)
                return MPIR_ERROR(comm, mpi_errno, myname);
            break;
        }
        mask <<= 1;
    }
    FREE((char *) buffer + lb);
    if (!op_ptr->commute && root != 0) {
        if (rank == 0) {
            mpi_errno = MPI_Send(recvbuf, count, datatype->self, root,
                                 MPIR_REDUCE_TAG, comm->self);
        } else if (rank == root) {
            mpi_errno = MPI_Recv(recvbuf, count, datatype->self, 0,     /*size-1, */
                                 MPIR_REDUCE_TAG, comm->self, &status);
        }
    }

    /* Free the temporarily allocated recvbuf */
    if (rank != root)
        FREE((char *) recvbuf + lb);

    /* If the predefined operation detected an error, report it here */
    /* Note that only the root gets this result, so this can cause
       programs to hang, particularly if this is used to implement 
       MPI_Allreduce.  Use care with this.
     */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno) {
        /* PRINTF( "Error in performing MPI_Op in reduce\n" ); */
        mpi_errno = MPIR_Op_errno;
    }

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
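The combine/send order described in the algorithm comment above can likewise be printed by a standalone sketch (not from the source; size and lroot are made-up inputs, with lroot = 0 modelling the non-commutative case where rank 0 acts as the effective root).

#include <stdio.h>

/* Print the binomial reduce schedule used by intra_Reduce. */
static void print_reduce_schedule(int size, int lroot)
{
    for (int rank = 0; rank < size; rank++) {
        int relrank = (rank - lroot + size) % size;
        int mask = 0x1;
        while (mask < size) {
            if ((relrank & mask) == 0) {
                int source = relrank | mask;
                if (source < size)
                    printf("rank %d combines data received from rank %d\n",
                           rank, (source + lroot) % size);
            } else {
                printf("rank %d sends its partial result to rank %d\n",
                       rank, ((relrank & ~mask) + lroot) % size);
                break;
            }
            mask <<= 1;
        }
    }
}

int main(void)
{
    print_reduce_schedule(8, 0);
    return 0;
}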
Example #9
0
static int intra_Alltoall(void *sendbuf,
                          int sendcount,
                          struct MPIR_DATATYPE *sendtype,
                          void *recvbuf,
                          int recvcnt,
                          struct MPIR_DATATYPE *recvtype,
                          struct MPIR_COMMUNICATOR *comm)
{
    int size, i, j;
    int me;
    MPI_Aint send_extent, recv_extent;
    int mpi_errno = MPI_SUCCESS;
    MPI_Status *starray;
    MPI_Request *reqarray;
    static char myname[] = "MPI_ALLTOALL";

    /* Get size and switch to collective communicator */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &me);
    comm = comm->comm_coll;

    /* Get extent of send and recv types */
    MPI_Type_extent(sendtype->self, &send_extent);
    MPI_Type_extent(recvtype->self, &recv_extent);

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* 1st, get some storage from the heap to hold handles, etc. */
    MPIR_ALLOC(starray,
               (MPI_Status *) MALLOC(2 * size * sizeof(MPI_Status)), comm,
               MPI_ERR_EXHAUSTED, myname);

    MPIR_ALLOC(reqarray,
               (MPI_Request *) MALLOC(2 * size * sizeof(MPI_Request)),
               comm, MPI_ERR_EXHAUSTED, myname);

    /* do the communication -- post *all* sends and receives: */
    /*
       ServerNet optimization: post all receives, then synchronously
       cycle through all of the sends.
     */
    for (i = 0; i < size; i++) {
        /* We'd like to avoid sending to and receiving from ourselves;
           however, this is complicated by the presence of different
           sendtypes and recvtypes. */
        if ((mpi_errno = MPI_Irecv((void *) ((char *) recvbuf +
                                             (((i +
                                                me) % size) * recvcnt *
                                              recv_extent)), recvcnt,
                                   recvtype->self, ((i + me) % size),
                                   MPIR_ALLTOALL_TAG, comm->self,
                                   &reqarray[i]))
            )
            break;
    }
    for (i = 0; i < size; i++) {
        MPI_Barrier(comm->self);
        if ((mpi_errno = MPI_Send((void *) ((char *) sendbuf +
                                            ((i +
                                              me) % size) * sendcount *
                                            send_extent), sendcount,
                                  sendtype->self, ((i + me) % size),
                                  MPIR_ALLTOALL_TAG, comm->self))
            )
            break;
    }

    if (mpi_errno)
        return mpi_errno;

    /* ... then wait for *all* of them to finish: */
    mpi_errno = MPI_Waitall(size, reqarray, starray);
    if (mpi_errno == MPI_ERR_IN_STATUS) {
        for (j = 0; j < size; j++) {
            if (starray[j].MPI_ERROR != MPI_SUCCESS)
                mpi_errno = starray[j].MPI_ERROR;
        }
    }

    /* clean up */
    FREE(starray);
    FREE(reqarray);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
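The rotated partner order (i + me) % size used by the post-receive and send loops above can be shown with a tiny standalone sketch (not from the source; the communicator size is a made-up input).

#include <stdio.h>

int main(void)
{
    int size = 4;
    for (int me = 0; me < size; me++) {
        printf("rank %d exchanges with:", me);
        for (int i = 0; i < size; i++)
            printf(" %d", (i + me) % size);
        printf("\n");
    }
    return 0;
}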
Example #10
0
static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPIR_Info * info,
                    MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int i;
    MPIR_Comm *win_comm_ptr;
    int win_target_pool_size;
    MPIR_CHKPMEM_DECL(5);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_WIN_INIT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_WIN_INIT);

    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    if (initRMAoptions) {

        MPIDI_CH3_RMA_Init_sync_pvars();
        MPIDI_CH3_RMA_Init_pkthandler_pvars();

        initRMAoptions = 0;
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    *win_ptr = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem);
    MPIR_ERR_CHKANDJUMP1(!(*win_ptr), mpi_errno, MPI_ERR_OTHER, "**nomem",
                         "**nomem %s", "MPIR_Win_mem");

    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_Object_set_ref(*win_ptr, 1);

    /* (*win_ptr)->errhandler is set by upper level; */
    /* (*win_ptr)->base is set by caller; */
    (*win_ptr)->size = size;
    (*win_ptr)->disp_unit = disp_unit;
    (*win_ptr)->create_flavor = create_flavor;
    (*win_ptr)->model = model;
    (*win_ptr)->attributes = NULL;
    (*win_ptr)->comm_ptr = win_comm_ptr;

    (*win_ptr)->at_completion_counter = 0;
    (*win_ptr)->shm_base_addrs = NULL;
    /* (*win_ptr)->basic_info_table[] is set by caller; */
    (*win_ptr)->current_lock_type = MPID_LOCK_NONE;
    (*win_ptr)->shared_lock_ref_cnt = 0;
    (*win_ptr)->target_lock_queue_head = NULL;
    (*win_ptr)->shm_allocated = FALSE;
    (*win_ptr)->states.access_state = MPIDI_RMA_NONE;
    (*win_ptr)->states.exposure_state = MPIDI_RMA_NONE;
    (*win_ptr)->num_targets_with_pending_net_ops = 0;
    (*win_ptr)->start_ranks_in_win_grp = NULL;
    (*win_ptr)->start_grp_size = 0;
    (*win_ptr)->lock_all_assert = 0;
    (*win_ptr)->lock_epoch_count = 0;
    (*win_ptr)->outstanding_locks = 0;
    (*win_ptr)->current_target_lock_data_bytes = 0;
    (*win_ptr)->sync_request_cnt = 0;
    (*win_ptr)->active = FALSE;
    (*win_ptr)->next = NULL;
    (*win_ptr)->prev = NULL;
    (*win_ptr)->outstanding_acks = 0;

    /* Initialize the info flags */
    (*win_ptr)->info_args.no_locks = 0;
    (*win_ptr)->info_args.accumulate_ordering = MPIDI_ACC_ORDER_RAR | MPIDI_ACC_ORDER_RAW |
        MPIDI_ACC_ORDER_WAR | MPIDI_ACC_ORDER_WAW;
    (*win_ptr)->info_args.accumulate_ops = MPIDI_ACC_OPS_SAME_OP_NO_OP;
    (*win_ptr)->info_args.same_size = 0;
    (*win_ptr)->info_args.same_disp_unit = FALSE;
    (*win_ptr)->info_args.alloc_shared_noncontig = 0;
    (*win_ptr)->info_args.alloc_shm = FALSE;
    if ((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        (*win_ptr)->info_args.alloc_shm = TRUE;
    }

    /* Set info_args on window based on info provided by user */
    mpi_errno = MPID_Win_set_info((*win_ptr), info);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    MPIR_CHKPMEM_MALLOC((*win_ptr)->op_pool_start, MPIDI_RMA_Op_t *,
                        sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno,
                        "RMA op pool", MPL_MEM_RMA);
    (*win_ptr)->op_pool_head = NULL;
    for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; i++) {
        (*win_ptr)->op_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND((*win_ptr)->op_pool_head, &((*win_ptr)->op_pool_start[i]));
    }

    win_target_pool_size =
        MPL_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(win_comm_ptr));
    MPIR_CHKPMEM_MALLOC((*win_ptr)->target_pool_start, MPIDI_RMA_Target_t *,
                        sizeof(MPIDI_RMA_Target_t) * win_target_pool_size, mpi_errno,
                        "RMA target pool", MPL_MEM_RMA);
    (*win_ptr)->target_pool_head = NULL;
    for (i = 0; i < win_target_pool_size; i++) {
        (*win_ptr)->target_pool_start[i].pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND((*win_ptr)->target_pool_head, &((*win_ptr)->target_pool_start[i]));
    }

    (*win_ptr)->num_slots = MPL_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(win_comm_ptr));
    MPIR_CHKPMEM_MALLOC((*win_ptr)->slots, MPIDI_RMA_Slot_t *,
                        sizeof(MPIDI_RMA_Slot_t) * (*win_ptr)->num_slots, mpi_errno, "RMA slots",
                        MPL_MEM_RMA);
    for (i = 0; i < (*win_ptr)->num_slots; i++) {
        (*win_ptr)->slots[i].target_list_head = NULL;
    }

    MPIR_CHKPMEM_MALLOC((*win_ptr)->target_lock_entry_pool_start,
                        MPIDI_RMA_Target_lock_entry_t *,
                        sizeof(MPIDI_RMA_Target_lock_entry_t) *
                        MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
                        "RMA lock entry pool", MPL_MEM_RMA);
    (*win_ptr)->target_lock_entry_pool_head = NULL;
    for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; i++) {
        DL_APPEND((*win_ptr)->target_lock_entry_pool_head,
                      &((*win_ptr)->target_lock_entry_pool_start[i]));
    }

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* this is the first window, register RMA progress hook */
        mpi_errno = MPID_Progress_register_hook(MPIDI_CH3I_RMA_Make_progress_global,
                                                &MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    DL_APPEND(MPIDI_RMA_Win_inactive_list_head, (*win_ptr));

    if (MPIDI_CH3U_Win_hooks.win_init != NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_hooks.win_init(size, disp_unit, create_flavor, model, info, comm_ptr,
                                          win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_WIN_INIT);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
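The op/target/lock-entry pools above follow a common pattern: preallocate a fixed block of elements and chain them onto a free list, so later allocations on the RMA critical path avoid malloc. Below is a standalone sketch of that pattern with a made-up struct and pool size (it does not use the MPICH types or the utlist DL_APPEND macro).

#include <stdio.h>
#include <stdlib.h>

typedef struct op {
    int pool_type;          /* which pool the element belongs to */
    struct op *next;        /* free-list link                    */
} op_t;

#define POOL_SIZE 8

int main(void)
{
    op_t *pool_start = calloc(POOL_SIZE, sizeof(op_t));
    op_t *pool_head = NULL;

    /* Chain every preallocated element onto the free list. */
    for (int i = POOL_SIZE - 1; i >= 0; i--) {
        pool_start[i].pool_type = 0;
        pool_start[i].next = pool_head;
        pool_head = &pool_start[i];
    }

    /* "Allocate" by popping the head; "free" would push it back. */
    op_t *op = pool_head;
    pool_head = op->next;
    printf("allocated element %ld from the pool\n", (long) (op - pool_start));

    free(pool_start);
    return 0;
}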
Example #11
0
int MPIR_intra_Scan ( void *sendbuf, void *recvbuf, int count, 
		      struct MPIR_DATATYPE *datatype, MPI_Op op, 
		      struct MPIR_COMMUNICATOR *comm )
{
  MPI_Status status;
  int        rank, size;
  int        mpi_errno = MPI_SUCCESS;
  MPI_Aint   lb, ub, m_extent;  /* Extent in memory */
  MPI_User_function   *uop;
  struct MPIR_OP *op_ptr;
  MPIR_ERROR_DECL;

  int dd; /* displacement, number of hops to send (power of 2) */
  int rr; /* "round rank" */
  void *tmpbuf;

  mpi_comm_err_ret = 0;

  /* Nov. 98: Improved O(log(size)) algorithm */

  /* See the overview in Collection Operations for why this is ok */
  if (count == 0) return MPI_SUCCESS;

  /* Get my rank & size and switch communicators to the hidden collective */
  MPIR_Comm_size ( comm, &size );
  MPIR_Comm_rank ( comm, &rank );
  MPIR_Type_get_limits( datatype, &lb, &ub );
  m_extent = ub - lb;
  comm	   = comm->comm_coll;
  op_ptr = MPIR_GET_OP_PTR(op);
  MPIR_TEST_MPI_OP(op,op_ptr,comm,"MPI_SCAN");
  uop	   = op_ptr->op;

  /* Lock for collective operation */
  MPID_THREAD_DS_LOCK(comm);

  MPIR_Op_errno = MPI_SUCCESS;

  if (rank>0) {
    /* allocate temporary receive buffer
       (needed both in commutative and noncommutative case) */
    MPIR_ALLOC(tmpbuf,(void *)MALLOC(m_extent * count),
               comm, MPI_ERR_EXHAUSTED, "Out of space in MPI_SCAN" );
    tmpbuf = (void *)((char*)tmpbuf-lb);
  }
  MPIR_COPYSELF( sendbuf, count, datatype->self, recvbuf,
                 MPIR_SCAN_TAG, rank, comm->self );

  /* compute partial scans */
  rr = rank; dd = 1;
  while ((rr&1)==1) {
    /* odd "round rank"s receive */

    mpi_errno = MPI_Recv(tmpbuf,count,datatype->self,rank-dd,
                         MPIR_SCAN_TAG,comm->self,&status);
    if (mpi_errno) return mpi_errno;
#ifdef WIN32
    if(op_ptr->stdcall) op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self);
    else
#endif
    (*uop)(tmpbuf, recvbuf, &count, &datatype->self);

    dd <<= 1; /* dd*2 */
    rr >>= 1; /* rr/2 */

    /* Invariant: recvbuf contains the scan of
       (rank-dd)+1, (rank-dd)+2,..., rank */
  }
  /* rr even, rank==rr*dd+dd-1, recvbuf contains the scan of
     rr*dd, rr*dd+1,..., rank */

  /* send partial scan forwards */
  if (rank+dd<size) {
    mpi_errno = MPI_Send(recvbuf,count,datatype->self,rank+dd,MPIR_SCAN_TAG,
                         comm->self);
    if (mpi_errno) return mpi_errno;
  }

  if (rank-dd>=0) {
    mpi_errno = MPI_Recv(tmpbuf,count,datatype->self,rank-dd,
                         MPIR_SCAN_TAG,comm->self,&status);
    if (mpi_errno) return mpi_errno;
#ifdef WIN32
    if(op_ptr->stdcall) op_ptr->op_s(tmpbuf, recvbuf, &count, &datatype->self);
    else
#endif
    (*uop)(tmpbuf, recvbuf, &count, &datatype->self);
    /* recvbuf contains the scan of 0,..., rank */
  }

  /* send result forwards */
  do {
    dd >>= 1; /* dd/2 */
  } while (rank+dd>=size);
  while (dd>0) {
    mpi_errno = MPI_Send(recvbuf,count,datatype->self,rank+dd,MPIR_SCAN_TAG,
                         comm->self);
    if (mpi_errno) return mpi_errno;
    dd >>= 1; /* dd/2 */
  }

  if (rank>0) {
    /* free temporary receive buffer */
    FREE((char*)tmpbuf+lb);
  }

  /* If the predefined operation detected an error, report it here */
  if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno)
      mpi_errno = MPIR_Op_errno;

  /* Unlock for collective operation */
  MPID_THREAD_DS_UNLOCK(comm);

  return(mpi_errno);
}
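The message schedule implied by the "round rank" logic above can be printed with a standalone sketch (not from the source; the communicator size is a made-up input).

#include <stdio.h>

/* Print who receives from and sends to whom in MPIR_intra_Scan above. */
static void print_scan_schedule(int size)
{
    for (int rank = 0; rank < size; rank++) {
        int rr = rank, dd = 1;
        while (rr & 1) {                        /* odd round ranks receive  */
            printf("rank %d receives a partial scan from rank %d\n", rank, rank - dd);
            dd <<= 1;
            rr >>= 1;
        }
        if (rank + dd < size)                   /* forward the partial scan */
            printf("rank %d forwards its partial scan to rank %d\n", rank, rank + dd);
        if (rank - dd >= 0)                     /* receive the full prefix  */
            printf("rank %d receives its prefix from rank %d\n", rank, rank - dd);
        do {                                    /* fan the result out       */
            dd >>= 1;
        } while (rank + dd >= size);
        while (dd > 0) {
            printf("rank %d sends its result to rank %d\n", rank, rank + dd);
            dd >>= 1;
        }
    }
}

int main(void)
{
    print_scan_schedule(8);
    return 0;
}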
Example #12
0
int MPIR_intra_Scan ( void *sendbuf, void *recvbuf, int count, 
		      struct MPIR_DATATYPE *datatype, MPI_Op op, 
		      struct MPIR_COMMUNICATOR *comm )
{
  MPI_Status status;
  int        rank, size;
  int        mpi_errno = MPI_SUCCESS;
  MPI_User_function   *uop;
  struct MPIR_OP *op_ptr;
  int mask, dst; 
  MPI_Aint extent, lb;
  void *partial_scan, *tmp_buf;
  static char myname[] = "MPI_SCAN";

  if (count == 0) return MPI_SUCCESS;

  MPIR_Comm_size(comm, &size);
  MPIR_Comm_rank(comm, &rank);

  /* Switch communicators to the hidden collective */
  comm = comm->comm_coll;
 
  /* Lock for collective operation */
  MPID_THREAD_LOCK(comm->ADIctx,comm);

  op_ptr = MPIR_GET_OP_PTR(op);
  MPIR_TEST_MPI_OP(op,op_ptr,comm,myname);
  uop  = op_ptr->op;

  /* need to allocate temporary buffer to store partial scan*/
  MPI_Type_extent(datatype->self, &extent);
  MPIR_ALLOC(partial_scan,(void *)MALLOC(count*extent), comm,
             MPI_ERR_EXHAUSTED, myname);
  /* adjust for potential negative lower bound in datatype */
  MPI_Type_lb( datatype->self, &lb );
  partial_scan = (void *)((char*)partial_scan - lb);

  /* need to allocate temporary buffer to store incoming data*/
  MPIR_ALLOC(tmp_buf,(void *)MALLOC(count*extent), comm,
             MPI_ERR_EXHAUSTED, myname);
  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - lb);

  /* Since this is an inclusive scan, copy local contribution into
     recvbuf. */
  mpi_errno = MPI_Sendrecv ( sendbuf, count, datatype->self,
                             rank, MPIR_SCAN_TAG, 
                             recvbuf, count, datatype->self,
                             rank, MPIR_SCAN_TAG,
                             comm->self, &status );
  if (mpi_errno) return mpi_errno;

  mpi_errno = MPI_Sendrecv ( sendbuf, count, datatype->self,
                             rank, MPIR_SCAN_TAG, 
                             partial_scan, count, datatype->self,
                             rank, MPIR_SCAN_TAG,
                             comm->self, &status );
  if (mpi_errno) return mpi_errno;

  mask = 0x1;
  while (mask < size) {
      dst = rank ^ mask;
      if (dst < size) {
          /* Send partial_scan to dst. Recv into tmp_buf */
          mpi_errno = MPI_Sendrecv(partial_scan, count, datatype->self,
                                   dst, MPIR_SCAN_TAG, tmp_buf,
                                   count, datatype->self, dst,
                                   MPIR_SCAN_TAG, comm->self,
                                   &status); 
          if (mpi_errno) return mpi_errno;
          
          if (rank > dst) {
              (*uop)(tmp_buf, partial_scan, &count, &datatype->self);
              (*uop)(tmp_buf, recvbuf, &count, &datatype->self);
          }
          else {
              if (op_ptr->commute)
                  (*uop)(tmp_buf, partial_scan, &count, &datatype->self);
              else {
                  (*uop)(partial_scan, tmp_buf, &count, &datatype->self);
                  mpi_errno = MPI_Sendrecv(tmp_buf, count, datatype->self,
                                           rank, MPIR_SCAN_TAG, partial_scan,
                                           count, datatype->self, rank,
                                           MPIR_SCAN_TAG, comm->self,
                                           &status); 
                  if (mpi_errno) return mpi_errno;
              }
          }
      }
      mask <<= 1;
  }
  
  FREE((char *)partial_scan+lb); 
  FREE((char *)tmp_buf+lb); 
  
  /* Unlock for collective operation */
  MPID_THREAD_UNLOCK(comm->ADIctx,comm);

  return (mpi_errno);
}
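Here is a standalone sketch (not from the source) that simulates the recursive-doubling exchange above on plain integers with a commutative sum, showing that recvbuf ends up holding the inclusive prefix at every rank; the communicator size and contributions are made up.

#include <stdio.h>

#define SIZE 6

int main(void)
{
    int recvbuf[SIZE], partial[SIZE];

    /* Each "rank" contributes rank + 1. */
    for (int r = 0; r < SIZE; r++) {
        recvbuf[r] = r + 1;     /* inclusive scan result so far     */
        partial[r] = r + 1;     /* partial sum exchanged with peers */
    }

    for (int mask = 0x1; mask < SIZE; mask <<= 1) {
        int new_partial[SIZE], new_recv[SIZE];
        for (int r = 0; r < SIZE; r++) {
            int dst = r ^ mask;
            new_partial[r] = partial[r];
            new_recv[r] = recvbuf[r];
            if (dst < SIZE) {
                /* Both sides fold the peer's partial scan into their own
                 * partial scan; only the higher rank folds it into its result. */
                new_partial[r] = partial[r] + partial[dst];
                if (r > dst)
                    new_recv[r] = recvbuf[r] + partial[dst];
            }
        }
        for (int r = 0; r < SIZE; r++) {
            partial[r] = new_partial[r];
            recvbuf[r] = new_recv[r];
        }
    }

    for (int r = 0; r < SIZE; r++)
        printf("rank %d: scan = %d (expected %d)\n",
               r, recvbuf[r], (r + 1) * (r + 2) / 2);
    return 0;
}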
Example #13
0
/*@

MPI_Comm_size - Determines the size of the group associated with a communicator

Input Parameters:
. comm - communicator (handle) 

Output Parameters:
. size - number of processes in the group of 'comm'  (integer) 

Notes:

.N NULL

.N SignalSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_ARG
@*/
int MPI_Comm_size( MPI_Comm comm, int *size ) 
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_ptr = 0;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_COMM_SIZE);

    MPIR_ERRTEST_INITIALIZED_ORDIE();
    
    MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_COMM_SIZE);
    
    /* Validate parameters, especially handles needing to be converted */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    MPIR_ERRTEST_COMM(comm, mpi_errno);
	}
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* Convert MPI object handles to object pointers */
    MPID_Comm_get_ptr( comm, comm_ptr );
    
    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
	    MPIR_ERRTEST_ARGNULL(size,"size",mpi_errno);
            /* Validate comm_ptr */
            MPID_Comm_valid_ptr( comm_ptr, mpi_errno );
	    /* If comm_ptr is not valid, it will be reset to null */
            if (mpi_errno) goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */
    
    *size = MPIR_Comm_size(comm_ptr);
    
    /* ... end of body of routine ... */

#ifdef HAVE_ERROR_CHECKING
  fn_exit:
#endif
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_COMM_SIZE);
    return mpi_errno;
    
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
  fn_fail:
    {
	mpi_errno = MPIR_Err_create_code(
	    mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, 
	    "**mpi_comm_size",
	    "**mpi_comm_size %C %p", comm, size);
    }
    mpi_errno = MPIR_Err_return_comm( comm_ptr, FCNAME, mpi_errno );
    goto fn_exit;
#   endif
    /* --END ERROR HANDLING-- */
}
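A minimal usage sketch of MPI_Comm_size (not taken from the source above); it assumes an MPI installation and an mpicc-style compiler wrapper.

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int size;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    printf("MPI_COMM_WORLD holds %d processes\n", size);
    MPI_Finalize();
    return 0;
}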
Example #14
0
/* This function produces topology aware trees for reduction and broadcasts, with different
 * K values. This is a heavy-weight function as it allocates shared memory, generates topology
 * information, builds a package-level tree (for package leaders), and a per-package tree.
 * These are combined in shared memory for other ranks to read out from.
 * */
int MPIDI_SHM_topology_tree_init(MPIR_Comm * comm_ptr, int root, int bcast_k,
                                 MPIR_Treealgo_tree_t * bcast_tree, int *bcast_topotree_fail,
                                 int reduce_k, MPIR_Treealgo_tree_t * reduce_tree,
                                 int *reduce_topotree_fail, MPIR_Errflag_t * errflag)
{
    int *shared_region;
    MPL_shm_hnd_t fd;
    int num_ranks, rank;
    int mpi_errno = MPI_SUCCESS, mpi_errno_ret = MPI_SUCCESS;
    size_t shm_size;
    int **bind_map = NULL;
    int *max_entries_per_level = NULL;
    int **ranks_per_package = NULL;
    int *package_ctr = NULL;
    size_t topo_depth = 0;
    int package_level = 0, i, max_ranks_per_package = 0;
    bool mapfail_flag = false;

    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);

    num_ranks = MPIR_Comm_size(comm_ptr);
    rank = MPIR_Comm_rank(comm_ptr);

    /* Calculate the size of shared memory that would be needed */
    shm_size = sizeof(int) * 5 * num_ranks + num_ranks * sizeof(cpu_set_t);

    /* STEP 1. Create shared memory region for exchanging topology information (root only) */
    mpi_errno = MPIDIU_allocate_shm_segment(comm_ptr, shm_size, &fd, (void **) &shared_region,
                                            &mapfail_flag);
    if (mpi_errno || mapfail_flag) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* STEP 2. Collect cpu_sets for each rank at the root */
    cpu_set_t my_cpu_set;
    CPU_ZERO(&my_cpu_set);
    sched_getaffinity(0, sizeof(my_cpu_set), &my_cpu_set);
    ((cpu_set_t *) (shared_region))[rank] = my_cpu_set;
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* STEP 3. Root has all the cpu_set information, now build tree */
    if (rank == root) {
        topo_depth = hwloc_topology_get_depth(MPIR_Process.hwloc_topology);
        bind_map = (int **) MPL_malloc(num_ranks * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!bind_map, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < num_ranks; ++i) {
            bind_map[i] = (int *) MPL_calloc(topo_depth, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!bind_map[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        MPIDI_SHM_hwloc_init_bindmap(num_ranks, topo_depth, shared_region, bind_map);
        /* Done building the topology information */

        /* STEP 3.1. Count the maximum entries at each level - used for breaking the tree into
         * intra/inter socket */
        max_entries_per_level = (int *) MPL_calloc(topo_depth, sizeof(size_t), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!max_entries_per_level, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_level =
            MPIDI_SHM_topotree_get_package_level(topo_depth, max_entries_per_level, num_ranks,
                                                 bind_map);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            fprintf(stderr, "Breaking topology at :: %d (default= %d)\n", package_level,
                    MPIDI_SHM_TOPOTREE_CUTOFF);

        /* STEP 3.2. allocate space for the entries that go in each package based on hwloc info */
        ranks_per_package =
            (int
             **) MPL_malloc(max_entries_per_level[package_level] * sizeof(int *), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!ranks_per_package, mpi_errno, MPI_ERR_OTHER, "**nomem");
        package_ctr =
            (int *) MPL_calloc(max_entries_per_level[package_level], sizeof(int), MPL_MEM_OTHER);
        MPIR_ERR_CHKANDJUMP(!package_ctr, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            package_ctr[i] = 0;
            ranks_per_package[i] = (int *) MPL_calloc(num_ranks, sizeof(int), MPL_MEM_OTHER);
            MPIR_ERR_CHKANDJUMP(!ranks_per_package[i], mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        /* sort the ranks into packages based on the binding information */
        for (i = 0; i < num_ranks; ++i) {
            int package = bind_map[i][package_level];
            ranks_per_package[package][package_ctr[package]++] = i;
        }
        max_ranks_per_package = 0;
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            max_ranks_per_package = MPL_MAX(max_ranks_per_package, package_ctr[i]);
        }
        /* At this point we have done the common work in extracting topology information
         * and restructuring it to our needs. Now we generate the tree. */

        /* For Bcast, package leaders are added before the package local ranks, and the per_package
         * tree is left_skewed */
        mpi_errno = MPIDI_SHM_gen_tree(bcast_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 1 /*package_leaders_first */ ,
                                       0 /*left_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag =
                MPIX_ERR_PROC_FAILED ==
                MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }

    /* Every rank copies their tree out from shared memory */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, bcast_tree, bcast_topotree_fail);
    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("BCAST", comm_ptr->context_id, rank, bcast_tree);

    /* Wait until shared memory is available */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* Generate the reduce tree */
    /* For Reduce, package leaders are added after the package local ranks, and the per_package
     * tree is right_skewed (children are added in the reverse order) */
    if (rank == root) {
        memset(shared_region, 0, shm_size);
        mpi_errno = MPIDI_SHM_gen_tree(reduce_k, shared_region, max_entries_per_level,
                                       ranks_per_package, max_ranks_per_package, package_ctr,
                                       package_level, num_ranks, 0 /*package_leaders_last */ ,
                                       1 /*right_skewed */ , errflag);
        if (mpi_errno) {
            /* for communication errors, just record the error but continue */
            *errflag =
                MPIX_ERR_PROC_FAILED ==
                MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
            MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
            MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
        }
    }

    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* Each rank copies the reduce tree out */
    MPIDI_SHM_copy_tree(shared_region, num_ranks, rank, reduce_tree, reduce_topotree_fail);

    if (MPIDI_SHM_TOPOTREE_DEBUG)
        MPIDI_SHM_print_topotree_file("REDUCE", comm_ptr->context_id, rank, reduce_tree);
    /* Wait for all ranks to copy out the tree */
    mpi_errno = MPIR_Barrier_impl(comm_ptr, errflag);
    if (mpi_errno) {
        /* for communication errors, just record the error but continue */
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        MPIR_ERR_ADD(mpi_errno_ret, mpi_errno);
    }
    /* Cleanup */
    if (rank == root) {
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            MPL_free(ranks_per_package[i]);
        }
        MPL_free(ranks_per_package);
        MPL_free(package_ctr);
        if (MPIDI_SHM_TOPOTREE_DEBUG)
            for (i = 0; i < topo_depth; ++i) {
                fprintf(stderr, "Level :: %d, Max :: %d\n", i, max_entries_per_level[i]);
            }
        for (i = 0; i < num_ranks; ++i) {
            MPL_free(bind_map[i]);
        }
        MPL_free(max_entries_per_level);
        MPL_free(bind_map);
    }
    MPIDIU_destroy_shm_segment(shm_size, &fd, (void **) &shared_region);

  fn_exit:
    if (rank == root && MPIDI_SHM_TOPOTREE_DEBUG)
        fprintf(stderr, "Done creating tree for %d\n", num_ranks);
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_SHM_TOPOLOGY_TREE_INIT);
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
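The bucketing in STEP 3.2 above (sorting ranks into packages and finding the largest package) can be illustrated with a standalone sketch; the bind map and package count below are made-up data rather than hwloc output.

#include <stdio.h>

#define NUM_RANKS 6
#define NUM_PACKAGES 2

int main(void)
{
    /* Package id of each rank at the chosen topology level (made-up data). */
    int package_of_rank[NUM_RANKS] = { 0, 0, 1, 1, 0, 1 };
    int package_ctr[NUM_PACKAGES] = { 0 };
    int ranks_per_package[NUM_PACKAGES][NUM_RANKS];
    int max_ranks_per_package = 0;

    /* Sort the ranks into packages based on the binding information. */
    for (int i = 0; i < NUM_RANKS; ++i) {
        int package = package_of_rank[i];
        ranks_per_package[package][package_ctr[package]++] = i;
    }

    for (int p = 0; p < NUM_PACKAGES; ++p) {
        if (package_ctr[p] > max_ranks_per_package)
            max_ranks_per_package = package_ctr[p];
        printf("package %d:", p);
        for (int j = 0; j < package_ctr[p]; ++j)
            printf(" %d", ranks_per_package[p][j]);
        printf("\n");
    }
    printf("max ranks per package: %d\n", max_ranks_per_package);
    return 0;
}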
Example #15
0
/*@

MPI_Cart_create - Makes a new communicator to which topology information
                  has been attached

Input Parameters:
+ comm_old - input communicator (handle) 
. ndims - number of dimensions of cartesian grid (integer) 
. dims - integer array of size ndims specifying the number of processes in 
  each dimension 
. periods - logical array of size ndims specifying whether the grid is 
  periodic (true) or not (false) in each dimension 
- reorder - ranking may be reordered (true) or not (false) (logical) 

Output Parameter:
. comm_cart - communicator with new cartesian topology (handle) 

Algorithm:
We ignore 'reorder' info currently.

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TOPOLOGY
.N MPI_ERR_DIMS
.N MPI_ERR_ARG
@*/
int MPI_Cart_create ( MPI_Comm comm_old, int ndims, int *dims, int *periods, 
		      int reorder, MPI_Comm *comm_cart )
{
  int range[1][3];
  MPI_Group group_old, group;
  int i, rank, num_ranks = 1;
  int mpi_errno = MPI_SUCCESS;
  int flag, size;
  MPIR_TOPOLOGY *topo;
  struct MPIR_COMMUNICATOR *comm_old_ptr;
  static char myname[] = "MPI_CART_CREATE";

  TR_PUSH(myname);
  comm_old_ptr = MPIR_GET_COMM_PTR(comm_old);

  /* Check validity of arguments */
#ifndef MPIR_NO_ERROR_CHECKING
  MPIR_TEST_MPI_COMM(comm_old,comm_old_ptr,comm_old_ptr,myname);
  MPIR_TEST_ARG(comm_cart);
  MPIR_TEST_ARG(periods);
  if (ndims < 1 || dims == (int *)0) mpi_errno = MPI_ERR_DIMS;
  if (mpi_errno)
	return MPIR_ERROR(comm_old_ptr, mpi_errno, myname );
  
  /* Check for Intra-communicator */
  MPI_Comm_test_inter ( comm_old, &flag );
  if (flag)
    return MPIR_ERROR(comm_old_ptr, 
            MPIR_ERRCLASS_TO_CODE(MPI_ERR_COMM,MPIR_ERR_COMM_INTER), myname );
#endif

  /* Determine number of ranks in topology */
  for ( i=0; i<ndims; i++ )
    num_ranks    *= (dims[i]>0)?dims[i]:-dims[i];
  if ( num_ranks < 1 ) {
    (*comm_cart)  = MPI_COMM_NULL;
    return MPIR_ERROR( comm_old_ptr, MPI_ERR_TOPOLOGY, myname );
  }

  /* Is the old communicator big enough? */
  MPIR_Comm_size (comm_old_ptr, &size);
  if (num_ranks > size) {
      mpi_errno = MPIR_Err_setmsg( MPI_ERR_TOPOLOGY, MPIR_ERR_TOPO_TOO_LARGE, 
				   myname, 
                  "Topology size is larger than size of communicator",
		  "Topology size %d is greater than communicator size %d", 
				   num_ranks, size );
	return MPIR_ERROR(comm_old_ptr, mpi_errno, myname );
  }
	
  /* Make new comm */
  range[0][0] = 0; range[0][1] = num_ranks - 1; range[0][2] = 1;
  MPI_Comm_group ( comm_old, &group_old );
  MPI_Group_range_incl ( group_old, 1, range, &group );
  MPI_Comm_create  ( comm_old, group, comm_cart );
  MPI_Group_free( &group );
  MPI_Group_free( &group_old );

  /* Store topology information in new communicator */
  if ( (*comm_cart) != MPI_COMM_NULL ) {
      MPIR_ALLOC(topo,(MPIR_TOPOLOGY *) MPIR_SBalloc ( MPIR_topo_els ),
		 comm_old_ptr,MPI_ERR_EXHAUSTED,myname);
      MPIR_SET_COOKIE(&topo->cart,MPIR_CART_TOPOL_COOKIE);
      topo->cart.type         = MPI_CART;
      topo->cart.nnodes       = num_ranks;
      topo->cart.ndims        = ndims;
      MPIR_ALLOC(topo->cart.dims,(int *)MALLOC( sizeof(int) * 3 * ndims ),
		 comm_old_ptr,MPI_ERR_EXHAUSTED,myname);
      topo->cart.periods      = topo->cart.dims + ndims;
      topo->cart.position     = topo->cart.periods + ndims;
      for ( i=0; i<ndims; i++ ) {
	  topo->cart.dims[i]    = dims[i];
	  topo->cart.periods[i] = periods[i];
      }

    /* Compute my position */
    MPI_Comm_rank ( (*comm_cart), &rank );
    for ( i=0; i < ndims; i++ ) {
      num_ranks = num_ranks / dims[i];
      topo->cart.position[i]  = rank / num_ranks;
      rank   = rank % num_ranks;
    }

    /* cache topology information */
    MPI_Attr_put ( (*comm_cart), MPIR_TOPOLOGY_KEYVAL, (void *)topo );
  }
  TR_POP;
  return (mpi_errno);
}
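
The position loop above is a row-major decomposition of the linear rank into
cartesian coordinates (the last dimension varies fastest).  As an illustration
only, and not part of this source tree, here is a minimal caller-side sketch of
the same API, assuming a standard MPI installation:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    MPI_Comm cart;
    int dims[2] = {0, 0}, periods[2] = {0, 0};
    int rank, size, coords[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Let MPI choose a balanced 2-D factorization of 'size' */
    MPI_Dims_create(size, 2, dims);

    /* Non-periodic grid; reorder = 0 (this implementation ignores 'reorder' anyway) */
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);

    if (cart != MPI_COMM_NULL) {
        MPI_Comm_rank(cart, &rank);
        /* Same arithmetic as the position loop above: divide, then take the remainder */
        MPI_Cart_coords(cart, rank, 2, coords);
        printf("rank %d -> (%d,%d) in a %dx%d grid\n",
               rank, coords[0], coords[1], dims[0], dims[1]);
        MPI_Comm_free(&cart);
    }
    MPI_Finalize();
    return 0;
}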
Example #16
0
static int intra_Scatterv(void *sendbuf,
                          int *sendcnts,
                          int *displs,
                          struct MPIR_DATATYPE *sendtype,
                          void *recvbuf,
                          int recvcnt,
                          struct MPIR_DATATYPE *recvtype,
                          int root, struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size;
    int mpi_errno = MPI_SUCCESS;
    static char myname[] = "MPI_SCATTERV";

    /* Get size and rank */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);

    /* Check for invalid arguments */
#ifndef MPIR_NO_ERROR_CHECKING
    if (root >= size)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_ROOT_TOOBIG,
                                    myname, (char *) 0, (char *) 0, root,
                                    size);
    if (root < 0)
        mpi_errno = MPIR_Err_setmsg(MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                    (char *) 0, (char *) 0, root);
    if (mpi_errno)
        return MPIR_ERROR(comm, mpi_errno, myname);
#endif

    /* Switch communicators to the hidden collective */
    comm = comm->comm_coll;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* If I'm the root, then scatter */
    if (rank == root) {
        MPI_Aint extent;
        int i;

        MPI_Type_extent(sendtype->self, &extent);
        /* We could use Isend here, but since the receivers need to execute
           a simple Recv, it may not make much difference in performance, 
           and using the blocking version is simpler */
        for (i = 0; i < root; i++) {
            mpi_errno =
                MPI_Send((void *) ((char *) sendbuf + displs[i] * extent),
                         sendcnts[i], sendtype->self, i, MPIR_SCATTERV_TAG,
                         comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
        mpi_errno =
            MPI_Sendrecv((void *) ((char *) sendbuf +
                                   displs[rank] * extent), sendcnts[rank],
                         sendtype->self, rank, MPIR_SCATTERV_TAG, recvbuf,
                         recvcnt, recvtype->self, rank, MPIR_SCATTERV_TAG,
                         comm->self, &status);
        if (mpi_errno)
            return mpi_errno;

        for (i = root + 1; i < size; i++) {
            mpi_errno =
                MPI_Send((void *) ((char *) sendbuf + displs[i] * extent),
                         sendcnts[i], sendtype->self, i, MPIR_SCATTERV_TAG,
                         comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
    } else
        mpi_errno = MPI_Recv(recvbuf, recvcnt, recvtype->self, root,
                             MPIR_SCATTERV_TAG, comm->self, &status);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
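
For reference, the root loop above implements the usual variable-count scatter
semantics: rank i receives sendcnts[i] elements taken at offset displs[i] (in
units of the send type's extent) from the root's buffer.  A minimal,
hypothetical user-level sketch of that behaviour, assuming a standard MPI
installation; the counts chosen here (rank i gets i+1 integers) are purely for
illustration:

#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, size, i, root = 0;
    int *sendbuf = NULL, *sendcnts = NULL, *displs = NULL;
    int recvcnt, *recvbuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    recvcnt = rank + 1;                 /* rank i gets i+1 integers */
    recvbuf = malloc(recvcnt * sizeof(int));

    if (rank == root) {
        int total = size * (size + 1) / 2, offset = 0;
        sendbuf = malloc(total * sizeof(int));
        sendcnts = malloc(size * sizeof(int));
        displs = malloc(size * sizeof(int));
        for (i = 0; i < size; i++) {
            sendcnts[i] = i + 1;
            displs[i] = offset;
            offset += i + 1;
        }
        for (i = 0; i < total; i++)
            sendbuf[i] = i;
    }

    /* sendbuf/sendcnts/displs are significant only at the root */
    MPI_Scatterv(sendbuf, sendcnts, displs, MPI_INT,
                 recvbuf, recvcnt, MPI_INT, root, MPI_COMM_WORLD);

    printf("rank %d received %d ints starting with %d\n", rank, recvcnt, recvbuf[0]);

    free(recvbuf);
    if (rank == root) { free(sendbuf); free(sendcnts); free(displs); }
    MPI_Finalize();
    return 0;
}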
Example #17
0
static int intra_Allgatherv(void *sendbuf,
                            int sendcount,
                            struct MPIR_DATATYPE *sendtype,
                            void *recvbuf,
                            int *recvcounts,
                            int *displs,
                            struct MPIR_DATATYPE *recvtype,
                            struct MPIR_COMMUNICATOR *comm)
{
    int size, rank;
    int mpi_errno = MPI_SUCCESS;
    MPI_Status status;
    MPI_Aint recv_extent;
    int j, jnext, i, right, left;

    /* Get the size of the communicator */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);

    /* Switch communicators to the hidden collective */
    comm = comm->comm_coll;

    /* Do a gather for each process in the communicator.
       This is the "circular" (ring) algorithm for allgatherv: each process
       sends to its right neighbour and receives from its left.  This is
       faster than simply performing 'size' separate Gathervs.
     */

    MPI_Type_extent(recvtype->self, &recv_extent);

    /* First, load the "local" version in the recvbuf. */
    mpi_errno =
        MPI_Sendrecv(sendbuf, sendcount, sendtype->self, rank,
                     MPIR_ALLGATHERV_TAG,
                     (void *) ((char *) recvbuf +
                               displs[rank] * recv_extent),
                     recvcounts[rank], recvtype->self, rank,
                     MPIR_ALLGATHERV_TAG, comm->self, &status);
    if (mpi_errno) {
        return mpi_errno;
    }

    left = (size + rank - 1) % size;
    right = (rank + 1) % size;

    j = rank;
    jnext = left;
    for (i = 1; i < size; i++) {
        mpi_errno =
            MPI_Sendrecv((void *) ((char *) recvbuf +
                                   displs[j] * recv_extent), recvcounts[j],
                         recvtype->self, right, MPIR_ALLGATHERV_TAG,
                         (void *) ((char *) recvbuf +
                                   displs[jnext] * recv_extent),
                         recvcounts[jnext], recvtype->self, left,
                         MPIR_ALLGATHERV_TAG, comm->self, &status);
        if (mpi_errno)
            break;
        j = jnext;
        jnext = (size + jnext - 1) % size;
    }
    return (mpi_errno);
}
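
The j/jnext indexing walks the ring backwards: in the first step each rank
sends its own block to its right neighbour and receives its left neighbour's
block; in each later step it forwards the block it has just received.  After
size-1 steps every rank holds all blocks.  A tiny standalone sketch (no MPI
required) that prints this schedule for an assumed communicator size of 4:

#include <stdio.h>

int main(void)
{
    int size = 4;               /* assumed communicator size for illustration */
    int rank, i, j, jnext;

    for (rank = 0; rank < size; rank++) {
        j = rank;
        jnext = (size + rank - 1) % size;
        printf("rank %d:", rank);
        for (i = 1; i < size; i++) {
            /* send block j to the right neighbour, receive block jnext from the left */
            printf("  step %d: send %d, recv %d;", i, j, jnext);
            j = jnext;
            jnext = (size + jnext - 1) % size;
        }
        printf("\n");
    }
    return 0;
}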
Example #18
0
static int intra_Reduce_scatter(void *sendbuf,
                                void *recvbuf,
                                int *recvcnts,
                                struct MPIR_DATATYPE *datatype,
                                MPI_Op op, struct MPIR_COMMUNICATOR *comm)
{
    int rank, size, i, count = 0;
    MPI_Aint lb, ub, m_extent;  /* Extent in memory */
    int *displs;
    void *buffer;
    int mpi_errno = MPI_SUCCESS, rc;
    static char myname[] = "MPI_REDUCE_SCATTER";

    /* Determine the "count" of items to reduce and set the displacements */
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    /* MPI_Type_extent (datatype, &extent); */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);

    /* Allocate the displacements and initialize them */
    MPIR_ALLOC(displs, (int *) MALLOC(size * sizeof(int)), comm,
               MPI_ERR_EXHAUSTED, myname);
    for (i = 0; i < size; i++) {
        displs[i] = count;
        count += recvcnts[i];
        if (recvcnts[i] < 0) {
            FREE(displs);
            mpi_errno =
                MPIR_Err_setmsg(MPI_ERR_COUNT, MPIR_ERR_COUNT_ARRAY_NEG,
                                myname, (char *) 0, (char *) 0, i,
                                recvcnts[i]);
            return mpi_errno;
        }
    }

    /* Allocate a temporary buffer */
    if (count == 0) {
        FREE(displs);
        return MPI_SUCCESS;
    }

    MPIR_ALLOC(buffer, (void *) MALLOC(m_extent * count), comm,
               MPI_ERR_EXHAUSTED, myname);
    buffer = (void *) ((char *) buffer - lb);

    /* Reduce to 0, then scatter */
    mpi_errno = MPI_Reduce(sendbuf, buffer, count, datatype->self, op, 0,
                           comm->self);
    if (mpi_errno == MPI_SUCCESS || mpi_errno == MPIR_ERR_OP_NOT_DEFINED) {
        rc = MPI_Scatterv(buffer, recvcnts, displs, datatype->self,
                          recvbuf, recvcnts[rank], datatype->self, 0,
                          comm->self);
        if (rc)
            mpi_errno = rc;
    }
    /* Free the temporary buffers */
    FREE((char *) buffer + lb);
    FREE(displs);
    return (mpi_errno);
}
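
Semantically, the routine above is an MPI_Reduce to rank 0 followed by an
MPI_Scatterv that uses recvcnts as the counts and their running sum as the
displacements.  A minimal, hypothetical user-level sketch of the same
semantics, assuming a standard MPI installation (each rank contributes one
element per destination and keeps one reduced element):

#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, size, i, result;
    int *recvcnts, *sendbuf;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    recvcnts = malloc(size * sizeof(int));
    sendbuf = malloc(size * sizeof(int));
    for (i = 0; i < size; i++) {
        recvcnts[i] = 1;            /* each rank keeps exactly one element */
        sendbuf[i] = rank + i;      /* arbitrary contribution */
    }

    MPI_Reduce_scatter(sendbuf, &result, recvcnts, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    /* result on rank r is the sum over all ranks q of (q + r) */
    printf("rank %d: reduce_scatter result = %d\n", rank, result);

    free(sendbuf);
    free(recvcnts);
    MPI_Finalize();
    return 0;
}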
Example #19
0
static int intra_Alltoallv(void *sendbuf,
                           int *sendcnts,
                           int *sdispls,
                           struct MPIR_DATATYPE *sendtype,
                           void *recvbuf,
                           int *recvcnts,
                           int *rdispls,
                           struct MPIR_DATATYPE *recvtype,
                           struct MPIR_COMMUNICATOR *comm)
{
    int size, i, j, rcnt;
    MPI_Aint send_extent, recv_extent;
    int mpi_errno = MPI_SUCCESS;
    MPI_Status *starray;
    MPI_Request *reqarray;

    /* Get size and switch to collective communicator */
    MPIR_Comm_size(comm, &size);
    comm = comm->comm_coll;

    /* Get extent of send and recv types */
    MPI_Type_extent(sendtype->self, &send_extent);
    MPI_Type_extent(recvtype->self, &recv_extent);

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* 1st, get some storage from the heap to hold handles, etc. */
    MPIR_ALLOC(starray,
               (MPI_Status *) MALLOC(2 * size * sizeof(MPI_Status)), comm,
               MPI_ERR_EXHAUSTED, "MPI_ALLTOALLV");

    MPIR_ALLOC(reqarray,
               (MPI_Request *) MALLOC(2 * size * sizeof(MPI_Request)),
               comm, MPI_ERR_EXHAUSTED, "MPI_ALLTOALLV");

    /* do the communication -- post *all* sends and receives: */
    rcnt = 0;
    for (i = 0; i < size; i++) {
        reqarray[2 * i] = MPI_REQUEST_NULL;
        if ((mpi_errno = MPI_Irecv((void *) ((char *) recvbuf +
                                             rdispls[i] * recv_extent),
                                   recvcnts[i], recvtype->self, i,
                                   MPIR_ALLTOALLV_TAG, comm->self,
                                   &reqarray[2 * i + 1]))
            )
            break;
        rcnt++;
        if ((mpi_errno = MPI_Isend((void *) ((char *) sendbuf +
                                             sdispls[i] * send_extent),
                                   sendcnts[i], sendtype->self, i,
                                   MPIR_ALLTOALLV_TAG, comm->self,
                                   &reqarray[2 * i]))
            )
            break;
        rcnt++;
    }

    /* ... then wait for *all* of them to finish: */
    if (mpi_errno) {
        /* We should really cancel all of the active requests */
        for (j = 0; j < rcnt; j++) {
            MPI_Cancel(&reqarray[j]);
        }
    } else {
        mpi_errno = MPI_Waitall(2 * size, reqarray, starray);
        if (mpi_errno == MPI_ERR_IN_STATUS) {
            for (j = 0; j < 2 * size; j++) {
                if (starray[j].MPI_ERROR != MPI_SUCCESS)
                    mpi_errno = starray[j].MPI_ERROR;
            }
        }
    }

    /* clean up */
    FREE(reqarray);
    FREE(starray);

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
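
The key point of the implementation is that every receive and every send is
posted up front and completion is handled by a single Waitall, so no ordering
is imposed between peers.  A user-level sketch of the same post-all-then-wait
pattern (hypothetical example; each rank exchanges one int with every other
rank, including itself), assuming a standard MPI installation:

#include <stdlib.h>
#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, size, i;
    int *sendvals, *recvvals;
    MPI_Request *reqs;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    sendvals = malloc(size * sizeof(int));
    recvvals = malloc(size * sizeof(int));
    reqs = malloc(2 * size * sizeof(MPI_Request));

    for (i = 0; i < size; i++)
        sendvals[i] = rank * 100 + i;

    /* Post all receives first, then all sends, then wait for everything. */
    for (i = 0; i < size; i++)
        MPI_Irecv(&recvvals[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD, &reqs[2 * i + 1]);
    for (i = 0; i < size; i++)
        MPI_Isend(&sendvals[i], 1, MPI_INT, i, 0, MPI_COMM_WORLD, &reqs[2 * i]);
    MPI_Waitall(2 * size, reqs, MPI_STATUSES_IGNORE);

    printf("rank %d got %d from rank 0\n", rank, recvvals[0]);

    free(reqs);
    free(recvvals);
    free(sendvals);
    MPI_Finalize();
    return 0;
}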
Example #20
0
static int intra_Scan(void *sendbuf,
                      void *recvbuf,
                      int count,
                      struct MPIR_DATATYPE *datatype,
                      MPI_Op op, struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint lb, ub, m_extent;  /* Extent in memory */
    MPI_User_function *uop;
    struct MPIR_OP *op_ptr;
    MPIR_ERROR_DECL;
    mpi_comm_err_ret = 0;

    /* See the overview in Collective Operations for why this is ok */
    if (count == 0)
        return MPI_SUCCESS;

    /* Get my rank & size and switch communicators to the hidden collective */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, "MPI_SCAN");
    uop = op_ptr->op;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* 
       This is an O(size) algorithm.  A modification of the algorithm in 
       reduce.c can be used to make this O(log(size)) 
     */
    /* commutative case requires no extra buffering */
    MPIR_Op_errno = MPI_SUCCESS;
    if (op_ptr->commute) {
        /* Do the scan operation */
        if (rank > 0) {
            mpi_errno = MPI_Recv(recvbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno)
                return mpi_errno;
            /* See reduce for why pop/push */
            MPIR_ERROR_POP(comm);
            (*uop) (sendbuf, recvbuf, &count, &datatype->self);
            MPIR_ERROR_PUSH(comm);
        } else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
    }
    /* non-commutative case requires extra buffering */
    else {
        /* Do the scan operation */
        if (rank > 0) {
            void *tmpbuf;
            MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * count),
                       comm, MPI_ERR_EXHAUSTED, "MPI_SCAN");
            tmpbuf = (void *) ((char *) tmpbuf - lb);
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                return mpi_errno;
            mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno)
                return mpi_errno;
            (*uop) (tmpbuf, recvbuf, &count, &datatype->self);
            FREE((char *) tmpbuf + lb);
        } else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno)
                return mpi_errno;
        }
    }

    /* send the running result (the "letter") on to the next process */
    if (rank < (size - 1))
        mpi_errno =
            MPI_Send(recvbuf, count, datatype->self, rank + 1,
                     MPIR_SCAN_TAG, comm->self);

    /* If the predefined operation detected an error, report it here */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno)
        mpi_errno = MPIR_Op_errno;

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
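
The Recv/op/Send chain above leaves each rank with the reduction of the
contributions from ranks 0 through its own rank, i.e. an inclusive prefix
operation.  A small, hypothetical caller-side sketch of that semantics,
assuming a standard MPI installation:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, contrib, prefix;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* Each rank contributes rank+1, so rank r should see 1 + 2 + ... + (r+1). */
    contrib = rank + 1;
    MPI_Scan(&contrib, &prefix, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);

    printf("rank %d: inclusive prefix sum = %d (expected %d)\n",
           rank, prefix, (rank + 1) * (rank + 2) / 2);

    MPI_Finalize();
    return 0;
}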
Example #21
0
/* Now the functions */
static int intra_Barrier(struct MPIR_COMMUNICATOR *comm)
{
    int rank, size, N2_prev, surfeit;
    int d, dst, src;
    MPI_Status status;

    /* Initialize communicator size */
    (void) MPIR_Comm_size(comm, &size);

#ifdef MPID_Barrier
    if (comm->ADIBarrier) {
        MPID_Barrier(comm->ADIctx, comm);
        return MPI_SUCCESS;
    }
#endif
    /* If there's only one member, this is trivial */
    if (size > 1) {

        /* Initialize collective communicator */
        comm = comm->comm_coll;
        (void) MPIR_Comm_rank(comm, &rank);
        (void) MPIR_Comm_N2_prev(comm, &N2_prev);
        surfeit = size - N2_prev;

        /* Lock for collective operation */
        MPID_THREAD_LOCK(comm->ADIctx, comm);

        /* Perform a combine-like operation */
        if (rank < N2_prev) {
            if (rank < surfeit) {

                /* get the fanin letter from the upper "half" process: */
                dst = N2_prev + rank;

                MPI_Recv((void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG,
                         comm->self, &status);
            }

            /* combine on embedded N2_prev power-of-two processes */
            for (d = 1; d < N2_prev; d <<= 1) {
                dst = (rank ^ d);

                MPI_Sendrecv((void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG,
                             (void *) 0, 0, MPI_INT, dst, MPIR_BARRIER_TAG,
                             comm->self, &status);
            }

            /* fanout data to nodes above N2_prev... */
            if (rank < surfeit) {
                dst = N2_prev + rank;
                MPI_Send((void *) 0, 0, MPI_INT, dst,
                         MPIR_BARRIER_TAG, comm->self);
            }
        } else {
            /* fanin data to power of 2 subset */
            src = rank - N2_prev;
            MPI_Sendrecv((void *) 0, 0, MPI_INT, src, MPIR_BARRIER_TAG,
                         (void *) 0, 0, MPI_INT, src, MPIR_BARRIER_TAG,
                         comm->self, &status);
        }

        /* Unlock for collective operation */
        MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    }
    return (MPI_SUCCESS);
}
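
The algorithm folds the "surfeit" ranks (those beyond N2_prev, the largest
power of two not exceeding size) into the embedded power-of-two subset before
the pairwise exchange and fans the result back out afterwards.  A tiny
standalone sketch of that split, under the assumption that MPIR_Comm_N2_prev
returns the greatest power of two less than or equal to size:

#include <stdio.h>

int main(void)
{
    int size, n2_prev, surfeit;

    for (size = 1; size <= 8; size++) {
        n2_prev = 1;
        while (n2_prev * 2 <= size)
            n2_prev *= 2;
        surfeit = size - n2_prev;
        printf("size %d: N2_prev = %d, surfeit = %d\n", size, n2_prev, surfeit);
    }
    return 0;
}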
Example #22
0
/*@

MPI_Intercomm_create - Creates an intercommunicator from two intracommunicators

Input Parameters:
+ local_comm - Local (intra)communicator
. local_leader - Rank in local_comm of leader (often 0)
. peer_comm - Remote communicator
. remote_leader - Rank in peer_comm of remote leader (often 0)
- tag - Message tag to use in constructing intercommunicator; if multiple
  'MPI_Intercomm_creates' are being made, they should use different tags (more
  precisely, ensure that the local and remote leaders are using different
  tags for each 'MPI_Intercomm_create').

Output Parameter:
. comm_out - Created intercommunicator

Notes:
  The MPI 1.1 Standard contains two mutually exclusive comments on the
  input intracommunicators.  One says that their respective groups must be
  disjoint; the other that the leaders can be the same process.  After
  some discussion by the MPI Forum, it has been decided that the groups must
  be disjoint.  Note that the `reason` given for this in the standard is
  `not` the reason for this choice; rather, the `other` operations on 
  intercommunicators (like 'MPI_Intercomm_merge') do not make sense if the
  groups are not disjoint.

.N fortran

Algorithm:
+ 1) Allocate a send context, an inter-coll context, and an intra-coll context
. 2) Send "send_context" and lrank_to_grank list from local comm group 
     if I''m the local_leader.
. 3) If I''m the local leader, then wait on the posted sends and receives
     to complete.  Post the receive for the remote group information and
	 wait for it to complete.
. 4) Broadcast information received from the remote leader.  
. 5) Create the inter_communicator from the information we now have.
-    An inter-communicator ends up with three levels of communicators. 
     The inter-communicator returned to the user, a "collective" 
     inter-communicator that can be used for safe communications between
     local & remote groups, and a collective intra-communicator that can 
     be used to allocate new contexts during the merge and dup operations.

	 For the resulting inter-communicator, 'comm_out'

.vb
       comm_out                       = inter-communicator
       comm_out->comm_coll            = "collective" inter-communicator
       comm_out->comm_coll->comm_coll = safe collective intra-communicator
.ve

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_TAG
.N MPI_ERR_EXHAUSTED
.N MPI_ERR_RANK

.seealso: MPI_Intercomm_merge, MPI_Comm_free, MPI_Comm_remote_group, 
          MPI_Comm_remote_size
@*/
EXPORT_MPI_API int MPI_Intercomm_create ( MPI_Comm local_comm, int local_leader, 
			   MPI_Comm peer_comm, int remote_leader, int tag, 
			   MPI_Comm *comm_out )
{
  int              local_size, local_rank, peer_size, peer_rank;
  int              remote_size;
  int              mpi_errno = MPI_SUCCESS;
  MPIR_CONTEXT     context, send_context;
  struct MPIR_GROUP *remote_group_ptr;
  struct MPIR_COMMUNICATOR *new_comm, *local_comm_ptr, *peer_comm_ptr;
  MPI_Request      req[6];
  MPI_Status       status[6];
  MPIR_ERROR_DECL;
  static char myname[]="MPI_INTERCOMM_CREATE";

  TR_PUSH(myname);
  local_comm_ptr = MPIR_GET_COMM_PTR(local_comm);

  
#ifndef MPIR_NO_ERROR_CHECKING
  /* Check for valid arguments to function */
  MPIR_TEST_MPI_COMM(local_comm,local_comm_ptr,local_comm_ptr,myname);
  MPIR_TEST_SEND_TAG(tag);
  if (mpi_errno)
      return MPIR_ERROR(local_comm_ptr, mpi_errno, myname );
#endif

  if (local_comm  == MPI_COMM_NULL) {
      mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_LOCAL_COMM, myname, 
		   "Local communicator must not be MPI_COMM_NULL", (char *)0 );
      return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
  }

  (void) MPIR_Comm_size ( local_comm_ptr, &local_size );
  (void) MPIR_Comm_rank ( local_comm_ptr, &local_rank );

  if ( local_leader == local_rank ) {
      /* Peer_comm need only be valid at the local_leader */
      peer_comm_ptr = MPIR_GET_COMM_PTR(peer_comm);
      if ((MPIR_TEST_COMM_NOTOK(peer_comm,peer_comm_ptr) || 
	   (peer_comm == MPI_COMM_NULL))) {
	  mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_PEER_COMM,
			       myname, "Peer communicator is not valid", 
				       (char *)0 );
      return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
      }

    (void) MPIR_Comm_size ( peer_comm_ptr,  &peer_size  );
    (void) MPIR_Comm_rank ( peer_comm_ptr,  &peer_rank  );

    if (((peer_rank     == MPI_UNDEFINED) && (mpi_errno = MPI_ERR_RANK)))
	return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );

    if (((remote_leader >= peer_size)     && (mpi_errno = MPI_ERR_RANK)) || 
        ((remote_leader <  0)             && (mpi_errno = MPI_ERR_RANK))) {
	mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_REMOTE_RANK, 
				     myname, 
				     "Error specifying remote_leader", 
"Error specifying remote_leader; value %d not between 0 and %d", remote_leader, peer_size );
       return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
    }
  }

  if (((local_leader  >= local_size)    && (mpi_errno = MPI_ERR_RANK)) || 
      ((local_leader  <  0)             && (mpi_errno = MPI_ERR_RANK))) {
      mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_LOCAL_RANK, 
				   myname, 
				   "Error specifying local_leader", 
"Error specifying local_leader; value %d not between 0 and %d", local_leader, local_size );
      return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
  }

  /* Allocate send context, inter-coll context and intra-coll context */
  MPIR_Context_alloc ( local_comm_ptr, 3, &context );

  
  /* If I'm the local leader, then exchange information */
  if (local_rank == local_leader) {
      MPIR_ERROR_PUSH(peer_comm_ptr);

      /* Post the receives for the information from the remote_leader */
      /* We don't post a receive for the remote group yet, because we */
      /* don't know how big it is yet. */
      MPIR_CALL_POP(MPI_Irecv (&remote_size, 1, MPI_INT, remote_leader, tag,
			       peer_comm, &(req[2])),peer_comm_ptr,myname);
      MPIR_CALL_POP(MPI_Irecv (&send_context, 1, MPIR_CONTEXT_TYPE, 
			       remote_leader,tag, peer_comm, &(req[3])),
		    peer_comm_ptr,myname);
    
      /* Send the lrank_to_grank table of the local_comm and an allocated */
      /* context. Currently I use multiple messages to send this info.    */
      /* Eventually, this will change(?) */
      MPIR_CALL_POP(MPI_Isend (&local_size, 1, MPI_INT, remote_leader, tag, 
               peer_comm, &(req[0])),peer_comm_ptr,myname);
      MPIR_CALL_POP(MPI_Isend (&context, 1, MPIR_CONTEXT_TYPE, remote_leader, 
               tag, peer_comm, &(req[1])),peer_comm_ptr,myname);
    
      /* Wait on the communication requests to finish */
      MPIR_CALL_POP(MPI_Waitall ( 4, req, status ),peer_comm_ptr,myname);
    
      /* We now know how big the remote group is, so create it */
      remote_group_ptr = MPIR_CreateGroup ( remote_size );
      remote_group_ptr->self = 
	  (MPI_Group) MPIR_FromPointer( remote_group_ptr );

      /* Post the receive for the group information */
      MPIR_CALL_POP(MPI_Irecv (remote_group_ptr->lrank_to_grank, remote_size, 
			       MPI_INT, remote_leader, tag, peer_comm, 
			       &(req[5])),peer_comm_ptr,myname);
    
      /* Send the local group info to the remote group */
      MPIR_CALL_POP(MPI_Isend (local_comm_ptr->group->lrank_to_grank, local_size, 
			       MPI_INT, remote_leader, tag, peer_comm, 
			       &(req[4])),peer_comm_ptr,myname);
    
      /* wait on the send and the receive for the group information */
      MPIR_CALL_POP(MPI_Waitall ( 2, &(req[4]), &(status[4]) ),peer_comm_ptr,
		    myname);
      MPIR_ERROR_POP(peer_comm_ptr);

      /* Now we can broadcast the group information to the other local comm */
      /* members. */
      MPIR_ERROR_PUSH(local_comm_ptr);
      MPIR_CALL_POP(MPI_Bcast(&remote_size,1,MPI_INT,local_rank,local_comm),
		    local_comm_ptr,myname);
      MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size, 
			      MPI_INT, local_rank, local_comm),local_comm_ptr,
		    myname);
      MPIR_ERROR_POP(local_comm_ptr);
  }
  /* Else I'm just an ordinary comm member, so receive the bcast'd */
  /* info about the remote group */
  else {
      MPIR_ERROR_PUSH(local_comm_ptr);
      MPIR_CALL_POP(MPI_Bcast(&remote_size, 1, MPI_INT, local_leader,
			      local_comm),local_comm_ptr,myname);
    
      /* We now know how big the remote group is, so create it */
      remote_group_ptr = MPIR_CreateGroup ( remote_size );
      remote_group_ptr->self = 
	  (MPI_Group) MPIR_FromPointer( remote_group_ptr );
	
      /* Receive the group info */
      MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size, 
			      MPI_INT, local_leader, local_comm),
		    local_comm_ptr,myname );
      MPIR_ERROR_POP(local_comm_ptr);
  }

  MPIR_ERROR_PUSH(local_comm_ptr);
  /* Broadcast the send context */
  MPIR_CALL_POP(MPI_Bcast(&send_context, 1, MPIR_CONTEXT_TYPE, 
			  local_leader, local_comm),local_comm_ptr,myname);
  MPIR_ERROR_POP(local_comm_ptr);

  /* We all now have all the information necessary, start building the */
  /* inter-communicator */
  MPIR_ALLOC(new_comm,NEW(struct MPIR_COMMUNICATOR),local_comm_ptr, 
	     MPI_ERR_EXHAUSTED,myname );
  MPIR_Comm_init( new_comm, local_comm_ptr, MPIR_INTER );
  *comm_out = new_comm->self;
  new_comm->group = remote_group_ptr;
  MPIR_Group_dup( local_comm_ptr->group, &(new_comm->local_group) );
  new_comm->local_rank	   = new_comm->local_group->local_rank;
  new_comm->lrank_to_grank = new_comm->group->lrank_to_grank;
  new_comm->np             = new_comm->group->np;
  new_comm->send_context   = send_context;
  new_comm->recv_context   = context;
  new_comm->comm_name      = 0;
  if ((mpi_errno = MPID_CommInit( local_comm_ptr, new_comm )) )
      return mpi_errno;
  (void) MPIR_Attr_create_tree ( new_comm );

  /* Build the collective inter-communicator */
  MPIR_Comm_make_coll( new_comm, MPIR_INTER );
  MPIR_Comm_make_onesided( new_comm, MPIR_INTER );
  
  /* Build the collective intra-communicator.  Note that we require
     an intra-communicator for the "coll_comm" so that MPI_COMM_DUP
     can use it for some collective operations (do we need this
     for MPI-2 with intercommunicator collective?) 
     
     Note that this really isn't the right thing to do; we need to replace
     *all* of the Mississippi state collective code.
   */
  MPIR_Comm_make_coll( new_comm->comm_coll, MPIR_INTRA );
#if 0
  MPIR_Comm_make_coll( new_comm->comm_onesided, MPIR_INTRA );
#endif
  
  /* Remember it for the debugger */
  MPIR_Comm_remember ( new_comm );

  TR_POP;
  return (mpi_errno);
}
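
As an illustration only, and not part of this source tree, here is a minimal
caller-side sketch that builds an intercommunicator between the two halves of
MPI_COMM_WORLD, using MPI_COMM_WORLD itself as the peer communicator and rank 0
of each half as the leaders.  The two groups are disjoint, as the Notes above
require; the sketch assumes at least two processes and a standard MPI
installation:

#include <stdio.h>
#include <mpi.h>

int main(int argc, char **argv)
{
    int rank, size, color, remote_leader, remote_size;
    MPI_Comm half, inter;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* Split into a "low" half and a "high" half (requires at least 2 processes). */
    color = (rank < size / 2) ? 0 : 1;
    MPI_Comm_split(MPI_COMM_WORLD, color, rank, &half);

    /* The remote leader is rank 0 of the other half, expressed in MPI_COMM_WORLD. */
    remote_leader = (color == 0) ? size / 2 : 0;
    MPI_Intercomm_create(half, 0, MPI_COMM_WORLD, remote_leader, 42, &inter);

    MPI_Comm_remote_size(inter, &remote_size);
    printf("world rank %d (group %d): remote group has %d processes\n",
           rank, color, remote_size);

    MPI_Comm_free(&inter);
    MPI_Comm_free(&half);
    MPI_Finalize();
    return 0;
}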