Example #1
int smpi_coll_tuned_alltoall_ompi2(void *sendbuf, int sendcount,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int recvcount, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int size, sendsize;  
  size = smpi_comm_size(comm);  
  sendsize = smpi_datatype_size(sendtype) * sendcount;  
  if (sendsize < 200 && size > 12) {
    return
        smpi_coll_tuned_alltoall_bruck(sendbuf, sendcount, sendtype,
                                       recvbuf, recvcount, recvtype,
                                       comm);
  } else if (sendsize < 3000) {
    return
        smpi_coll_tuned_alltoall_basic_linear(sendbuf, sendcount,
                                              sendtype, recvbuf,
                                              recvcount, recvtype, comm);
  } else {
    return
        smpi_coll_tuned_alltoall_ring(sendbuf, sendcount, sendtype,
                                      recvbuf, recvcount, recvtype,
                                      comm);
  }
}
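For reference, the thresholds above can be read as a small decision table. The following standalone sketch (a hypothetical helper, not part of SMPI) simply replays that table and prints which algorithm would be picked for a given per-rank send size and communicator size:

/* Hypothetical illustration of the decision logic in
 * smpi_coll_tuned_alltoall_ompi2: small messages on large communicators
 * use Bruck, intermediate sizes use basic linear, everything else ring. */
#include <stdio.h>

static const char *alltoall_choice(int sendsize, int comm_size)
{
  if (sendsize < 200 && comm_size > 12)
    return "bruck";
  else if (sendsize < 3000)
    return "basic_linear";
  else
    return "ring";
}

int main(void)
{
  printf("%s\n", alltoall_choice(100, 16));  /* bruck        */
  printf("%s\n", alltoall_choice(100, 8));   /* basic_linear */
  printf("%s\n", alltoall_choice(5000, 16)); /* ring         */
  return 0;
}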
// Allgather - Non-Topology-Specific Logical Ring (NTSLR) algorithm, non-blocking variant
int
smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
                                   void *rbuf, int rcount, MPI_Datatype rtype,
                                   MPI_Comm comm)
{
    MPI_Aint rextent, sextent;
    MPI_Status status, status2;
    int i, to, from, rank, size;
    int send_offset, recv_offset;
    int tag = COLL_TAG_ALLGATHER;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    rextent = smpi_datatype_get_extent(rtype);
    sextent = smpi_datatype_get_extent(stype);
    MPI_Request *rrequest_array;
    MPI_Request *srequest_array;
    rrequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));
    srequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));

    // irregular case: fall back to the default MPI functions
    if (scount * sextent != rcount * rextent) {
        XBT_WARN("MPI_allgather_NTSLR_NB uses default MPI_allgather.");
        smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
        return MPI_SUCCESS;
    }

    // topo non-specific
    to = (rank + 1) % size;
    from = (rank + size - 1) % size;

    //copy a single segment from sbuf to rbuf
    send_offset = rank * scount * sextent;

    smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                      (char *)rbuf + send_offset, rcount, rtype, rank, tag, comm, &status);


    //start sending logical ring message
    int increment = scount * sextent;

    //post all irecv first
    for (i = 0; i < size - 1; i++) {
        recv_offset = ((rank - i - 1 + size) % size) * increment;
        rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount, rtype, from, tag + i, comm);
    }


    for (i = 0; i < size - 1; i++) {
        send_offset = ((rank - i + size) % size) * increment;
        srequest_array[i] = smpi_mpi_isend((char *)rbuf + send_offset, scount, stype, to, tag + i, comm);
        smpi_mpi_wait(&rrequest_array[i], &status);
        smpi_mpi_wait(&srequest_array[i], &status2);
    }

    free(rrequest_array);
    free(srequest_array);

    return MPI_SUCCESS;
}
Example #3
int smpi_coll_tuned_alltoall_ompi( void *sbuf, int scount, 
                                             MPI_Datatype sdtype,
                                             void* rbuf, int rcount, 
                                             MPI_Datatype rdtype, 
                                             MPI_Comm comm)
{
    int communicator_size;
    size_t dsize, block_dsize;
    communicator_size = smpi_comm_size(comm);

    /* Decision function based on measurement on Grig cluster at 
       the University of Tennessee (2GB MX) up to 64 nodes.
       Has better performance for messages of intermediate sizes than the old one */
    /* determine block size */
    dsize = smpi_datatype_size(sdtype);
    block_dsize = dsize * scount;

    if ((block_dsize < 200) && (communicator_size > 12)) {
        return smpi_coll_tuned_alltoall_bruck(sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm);

    } else if (block_dsize < 3000) {
        return smpi_coll_tuned_alltoall_basic_linear(sbuf, scount, sdtype, 
                                                           rbuf, rcount, rdtype, 
                                                           comm);
    }

    return smpi_coll_tuned_alltoall_ring (sbuf, scount, sdtype, 
                                                    rbuf, rcount, rdtype,
                                                    comm);
}
/*****************************************************************************

 * Function: alltoallv_pair_mpi_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_counts: array with the number of elements to send to each process
    send_disps: array of displacements into send_buff
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_counts: array with the number of elements to receive from each process
    recv_disps: array of displacements into recv_buff
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works when P is a power of two. In each of the P - 1
           phases, paired nodes exchange their data. MPI barriers are
           inserted between consecutive phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoallv_pair_mpi_barrier(void *send_buff, int *send_counts, int *send_disps,
                                          MPI_Datatype send_type,
                                          void *recv_buff, int *recv_counts, int *recv_disps,
                                          MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = 101;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    smpi_mpi_barrier(comm);
    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst,
                 tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type,
                 src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
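The XOR pairing used above ("src = dst = rank ^ i") matches every rank with exactly one partner per phase when the number of processes is a power of two. A standalone sketch (hypothetical, not part of SMPI) that prints the pairing schedule:

/* Print the XOR pairwise exchange schedule for P ranks: in phase i,
 * rank r exchanges data with rank r ^ i, so the pairing is symmetric
 * and covers every rank exactly once per phase (P must be a power of two). */
#include <stdio.h>

int main(void)
{
  int P = 4; /* hypothetical communicator size (power of two) */
  for (int i = 0; i < P; i++) {
    printf("phase %d:", i);
    for (int r = 0; r < P; r++)
      printf(" %d<->%d", r, r ^ i);
    printf("\n");
  }
  return 0;
}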
Example #5
/*****************************************************************************

 * Function: alltoall_ring

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 communication steps plus a local copy.
           In step i, node j sends its block for node j + i and receives the
           block coming from node j - i.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoall_ring(void *send_buff, int send_count,
                              MPI_Datatype send_type, void *recv_buff,
                              int recv_count, MPI_Datatype recv_type,
                              MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALL;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  for (i = 0; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
                 tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
                 src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
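In the ring schedule above, step i has each rank send the block destined for rank + i and receive the block coming from rank - i (modulo P); step 0 is the local copy. A standalone sketch (hypothetical) that prints this schedule for one rank:

/* Print the per-step (dst, src) pairs used by one rank in the ring
 * alltoall above: dst = (rank + i) % P, src = (rank - i + P) % P. */
#include <stdio.h>

int main(void)
{
  int P = 5, rank = 2; /* hypothetical communicator size and rank */
  for (int i = 0; i < P; i++) {
    int dst = (rank + i) % P;
    int src = (rank - i + P) % P;
    printf("step %d: send to %d, receive from %d\n", i, dst, src);
  }
  return 0;
}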
Example #6
int
smpi_coll_tuned_bcast_flattree(void *buff, int count, MPI_Datatype data_type,
                               int root, MPI_Comm comm)
{
  MPI_Request *req_ptr;
  MPI_Request *reqs;

  int i, rank, num_procs;
  int tag = 1;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if (rank != root) {
    smpi_mpi_recv(buff, count, data_type, root, tag, comm, MPI_STATUS_IGNORE);
  }

  else {
    reqs = (MPI_Request *) xbt_malloc((num_procs - 1) * sizeof(MPI_Request));
    req_ptr = reqs;

    // Root sends data to all others
    for (i = 0; i < num_procs; i++) {
      if (i == rank)
        continue;
      *(req_ptr++) = smpi_mpi_isend(buff, count, data_type, i, tag, comm);
    }

    // wait on all requests
    smpi_mpi_waitall(num_procs - 1, reqs, MPI_STATUSES_IGNORE);

    free(reqs);
  }
  return MPI_SUCCESS;
}
Example #7
int smpi_coll_tuned_reduce_ompi_chain( void *sendbuf, void *recvbuf, int count,
                                        MPI_Datatype datatype, 
                                        MPI_Op  op, int root, 
                                        MPI_Comm  comm
                                        )
{
    uint32_t segsize=64*1024;
    int segcount = count;
    size_t typelng;
    int fanout = smpi_comm_size(comm)/2;

    XBT_DEBUG("coll:tuned:reduce_intra_chain rank %d fo %d ss %5d", smpi_comm_rank(comm), fanout, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng = smpi_datatype_size( datatype);
    
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm,
                                           ompi_coll_tuned_topo_build_chain(fanout, comm, root), 
                                           segcount, 0 );
}
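COLL_TUNED_COMPUTED_SEGCOUNT is defined elsewhere in the SMPI/Open MPI sources; roughly, it converts the byte segment size into a per-segment element count for the pipelined reduction. A minimal sketch of that idea, under the assumption that a segment simply holds segsize / typelng whole elements (the real macro also handles rounding and edge cases):

/* Assumed behaviour only: derive elements-per-segment from a byte segment
 * size and the datatype size; fall back to the full count when no
 * segmentation is needed. Not the actual macro definition. */
#include <stdio.h>
#include <stddef.h>

static int computed_segcount(size_t segsize, size_t typelng, int count)
{
  if (segsize == 0 || typelng == 0 || segsize >= (size_t)count * typelng)
    return count;                /* one segment holds the whole message */
  int segcount = (int)(segsize / typelng);
  return segcount > 0 ? segcount : 1;
}

int main(void)
{
  /* 64 KiB segments of 8-byte elements out of a 1M-element buffer -> 8192 */
  printf("%d\n", computed_segcount(64 * 1024, 8, 1000000));
  return 0;
}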
Example #8
/*****************************************************************************
 * Function: allgather_ring
 * return: int
 * inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j + i.
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgather_ring(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int recv_count, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  int i, src, dst, rank, num_procs;
  int tag = 1;
  MPI_Status status;

  char *sendptr = (char *) send_buff;
  char *recvptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  // local send/recv
  smpi_mpi_sendrecv(sendptr, send_count, send_type, rank, tag,
               recvptr + rank * recv_count * extent,
               recv_count, recv_type, rank, tag, comm, &status);

  for (i = 1; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;
    smpi_mpi_sendrecv(sendptr, send_count, send_type, dst, tag,
                 recvptr + src * recv_count * extent, recv_count, recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
Example #9
int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm
					)
{
    int rank, size;
    int distance, to, from;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG(
                 "ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) { 
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* send message to lower ranked node */
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to, 
                                              COLL_TAG_BARRIER,
                                              NULL, 0, MPI_BYTE, from, 
                                              COLL_TAG_BARRIER,
                                              comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;

}
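Because the distance doubles each round, the dissemination barrier above completes in ceil(log2(size)) rounds. A standalone sketch (hypothetical) of the peers one rank talks to:

/* Print the (to, from) peers contacted by one rank per round of the
 * dissemination (Bruck-style) barrier above; the distance doubles each
 * round, so 5 ranks need 3 rounds. */
#include <stdio.h>

int main(void)
{
  int size = 5, rank = 0; /* hypothetical communicator */
  for (int distance = 1; distance < size; distance <<= 1) {
    int to = (rank + distance) % size;
    int from = (rank + size - distance) % size;
    printf("distance %d: send to %d, receive from %d\n", distance, to, from);
  }
  return 0;
}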
Example #10
int
smpi_coll_tuned_reduce_flat_tree(void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = 4321;
  int size;
  int rank;
  MPI_Aint extent;
  char *origin = 0;
  char *inbuf;
  MPI_Status status;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* If not root, send data to the root. */
  extent = smpi_datatype_get_extent(dtype);

  if (rank != root) {
    smpi_mpi_send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = (char *) xbt_malloc(count * extent);


  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    smpi_mpi_sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    smpi_mpi_recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = sbuf;
    else {
      smpi_mpi_recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    smpi_op_apply(op, inbuf, rbuf, &count, &dtype);

  }

  if (origin)
    free(origin);

  /* All done */
  return 0;
}
Example #11
/*
 *	scatter_intra
 *
 *	Function:	- basic scatter operation
 *	Accepts:	- same arguments as MPI_Scatter()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
smpi_coll_tuned_scatter_ompi_basic_linear(void *sbuf, int scount,
					   MPI_Datatype sdtype,
					   void *rbuf, int rcount,
					   MPI_Datatype rdtype,
					   int root,
					   MPI_Comm comm
					   )
{
    int i, rank, size, err;
    char *ptmp;
    ptrdiff_t lb, incr;

    /* Initialize */

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    /* If not root, receive data. */

    if (rank != root) {
        smpi_mpi_recv(rbuf, rcount, rdtype, root,
                                COLL_TAG_SCATTER,
                                comm, MPI_STATUS_IGNORE);
        return MPI_SUCCESS;
    }

    /* I am the root, loop sending data. */

    err = smpi_datatype_extent(sdtype, &lb, &incr);
    if (MPI_SUCCESS != err) {
        return MPI_ERR_OTHER;
    }

    incr *= scount;
    for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {

        /* simple optimization */

        if (i == rank) {
            if (MPI_IN_PLACE != rbuf) {
                err =
                    smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount,
                                    rdtype);
            }
        } else {
            smpi_mpi_send(ptmp, scount, sdtype, i,
                                    COLL_TAG_SCATTER,
                                     comm);
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
Example #12
int smpi_coll_tuned_gather_ompi(void *sbuf, int scount, 
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount, 
                                           MPI_Datatype rdtype, 
                                           int root,
                                           MPI_Comm  comm
                                           )
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    // Determine block size 
    if (rank == root) {
        dsize = smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    } else {
        dsize = smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/

/*    } else*/ if (block_size > intermediate_block_size) {
        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, 
                                                         rbuf, rcount, rdtype, 
                                                         root, comm);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return smpi_coll_tuned_gather_ompi_binomial (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm);

    }
    // Otherwise, use basic linear 
    return smpi_coll_tuned_gather_ompi_basic_linear (sbuf, scount, sdtype, 
                                                      rbuf, rcount, rdtype, 
                                                      root, comm);
}
Example #13
int smpi_coll_tuned_reduce_scatter_ompi( void *sbuf, void *rbuf,
                                                    int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("smpi_coll_tuned_reduce_scatter_ompi");
    
    comm_size = smpi_comm_size(comm);
    // We need data size for decision function 
    dsize=smpi_datatype_size(dtype);
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) { 
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if( !smpi_op_is_commute(op) || (zerocounts)) {
        smpi_mpi_reduce_scatter (sbuf, rbuf, rcounts, 
                                                                    dtype, op, 
                                                                    comm); 
        return MPI_SUCCESS;
    }
   
    total_message_size *= dsize;

    // compute the smallest power of 2 >= comm_size
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return 
            smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm);
    } 
    return smpi_coll_tuned_reduce_scatter_ompi_ring(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);



}
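With a = 0.0012 [1/B] and b = 8, the third condition above selects recursive halving whenever the communicator is large relative to the total message size. A standalone sketch (hypothetical) that replays the decision for a couple of data points:

/* Replay of the reduce_scatter decision above for hypothetical inputs;
 * total_bytes stands for the summed rcounts times the datatype size. */
#include <stdio.h>
#include <stddef.h>

static const char *reduce_scatter_choice(int comm_size, size_t total_bytes,
                                         int comm_size_is_power_of_two)
{
  const double a = 0.0012, b = 8.0;
  const size_t small_message_size = 12 * 1024;
  const size_t large_message_size = 256 * 1024;
  if (total_bytes <= small_message_size ||
      (total_bytes <= large_message_size && comm_size_is_power_of_two) ||
      comm_size >= a * total_bytes + b)
    return "recursive_halving";
  return "ring";
}

int main(void)
{
  /* 100 KiB total: the crossover is 0.0012 * 102400 + 8, about 131 ranks */
  printf("%s\n", reduce_scatter_choice(64, 100 * 1024, 0));  /* ring */
  printf("%s\n", reduce_scatter_choice(256, 100 * 1024, 0)); /* recursive_halving */
  return 0;
}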
Example #14
/*
 * Simple double ring version of barrier
 *
 * The synchronization guarantee is provided by the last ring of sends, which are synchronous.
 *
 */
int smpi_coll_tuned_barrier_ompi_doublering(MPI_Comm comm
					     )
{
    int rank, size;
    int left, right;


    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_doublering rank %d", rank);

    left = ((rank-1+size)%size);
    right = ((rank+1)%size);

    if (rank > 0) { /* receive message from the left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left, 
                                COLL_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE);
    }

    /* Send message to the right */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right, 
                            COLL_TAG_BARRIER,
                             comm);

    /* root needs to receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left, 
                                COLL_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE);
    }

    /* Allow nodes to exit */
    if (rank > 0) { /* post Receive from left */
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left, 
                                COLL_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE);
    }

    /* send message to the right one */
    smpi_mpi_send((void*)NULL, 0, MPI_BYTE, right, 
                            COLL_TAG_BARRIER,
                             comm);
 
    /* rank 0 post receive from the last node */
    if (rank == 0) {
        smpi_mpi_recv((void*)NULL, 0, MPI_BYTE, left, 
                                COLL_TAG_BARRIER, comm,
                                MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;

}
Example #15
/**
 * Alltoall basic_linear (STARMPI:alltoall-simple)
 **/
int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
                                          MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount,
                                          MPI_Datatype recvtype,
                                          MPI_Comm comm)
{
  int system_tag = 888;
  int i, rank, size, err, count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  smpi_datatype_extent(sendtype, &lb, &sendext);
  smpi_datatype_extent(recvtype, &lb, &recvext);
  /* simple optimization */
  err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, 
                           sendcount, sendtype, 
                           (char *)recvbuf + rank * recvcount * recvext, 
                           recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] =
          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount, 
                          recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] =
          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
                          sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUSES_IGNORE);
    for(i = 0; i < count; i++) {
      if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
int smpi_coll_tuned_barrier_mvapich2_pair(MPI_Comm comm)
{

    int size, rank;
    int d, dst, src;
    int mpi_errno = MPI_SUCCESS;

    size = smpi_comm_size(comm);
    /* Trivial barriers return immediately */
    if (size == 1)
        return MPI_SUCCESS;

    rank =  smpi_comm_rank(comm);
    int N2_prev = 1;
    /*  N2_prev = greatest power of two <= size of Comm  */
    for( N2_prev = 1; N2_prev <= size; N2_prev <<= 1 );
    N2_prev >>= 1;
    
    int surfeit = size - N2_prev;

    /* Perform a combine-like operation */
    if (rank < N2_prev) {
        if (rank < surfeit) {
            /* get the fanin letter from the upper "half" process: */
            dst = N2_prev + rank;
            smpi_mpi_recv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                                     comm, MPI_STATUS_IGNORE);
        }

        /* combine on embedded N2_prev power-of-two processes */
        for (d = 1; d < N2_prev; d <<= 1) {
            dst = (rank ^ d);
            smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER, NULL,
                                 0, MPI_BYTE, dst, COLL_TAG_BARRIER, comm,
                                 MPI_STATUS_IGNORE);
        }

        /* fanout data to nodes above N2_prev... */
        if (rank < surfeit) {
            dst = N2_prev + rank;
            smpi_mpi_send(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                                     comm);
        }
    } else {
        /* fanin data to power of 2 subset */
        src = rank - N2_prev;
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                                     NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                                     comm, MPI_STATUS_IGNORE);
    }

    return mpi_errno;

}
Example #17
// Allgather - gather/bcast algorithm
int smpi_coll_tuned_allgather_GB(void *send_buff, int send_count,
                                 MPI_Datatype send_type, void *recv_buff,
                                 int recv_count, MPI_Datatype recv_type,
                                 MPI_Comm comm)
{
  int num_procs;
  num_procs = smpi_comm_size(comm);
  smpi_mpi_gather(send_buff, send_count, send_type, recv_buff, recv_count, recv_type,
             0, comm);
  mpi_coll_bcast_fun(recv_buff, (recv_count * num_procs), recv_type, 0, comm);

  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: allgather_spreading_simple
 * return: int
 *  inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Let i -> j denote the communication from node i to node j. The
 *         order of communications for node i is i -> i + 1, i -> i + 2, ...,
 *         i -> (i + P - 1) % P.
 *
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgather_spreading_simple(void *send_buff, int send_count,
                                           MPI_Datatype send_type,
                                           void *recv_buff, int recv_count,
                                           MPI_Datatype recv_type,
                                           MPI_Comm comm)
{
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;
  int i, src, dst, rank, num_procs, num_reqs;
  int tag = COLL_TAG_ALLGATHER;
  MPI_Status status;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  num_reqs = (2 * num_procs) - 2;
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));
  if (!reqs) {
    printf("allgather-spreading-simple.c:40: cannot allocate memory\n");
    MPI_Finalize();
    exit(0);
  }

  req_ptr = reqs;
  smpi_mpi_sendrecv(send_buff, send_count, send_type, rank, tag,
               (char *) recv_buff + rank * recv_count * extent, recv_count,
               recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++) {
    src = (rank + i) % num_procs;
    if (src == rank)
      continue;
    *(req_ptr++) = smpi_mpi_irecv(recv_ptr + src * recv_count * extent, recv_count, recv_type,
              src, tag, comm);
  }

  for (i = 0; i < num_procs; i++) {
    dst = (rank + i) % num_procs;
    if (dst == rank)
      continue;
    *(req_ptr++) = smpi_mpi_isend(send_buff, send_count, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
  free(reqs);

  return MPI_SUCCESS;
}
Example #19
int smpi_coll_tuned_allgatherv_ompi(void *sbuf, int scount, 
                                               MPI_Datatype sdtype,
                                               void* rbuf, int *rcounts, 
                                               int *rdispls,
                                               MPI_Datatype rdtype, 
                                               MPI_Comm  comm
                                               )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;
    
    communicator_size = smpi_comm_size(comm);
    
    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgatherv_pair(sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype, 
                                                           comm);
    }
    
    /* Determine complete data size */
    dsize=smpi_datatype_size(sdtype);
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }
    
    /* Decision based on allgather decision.   */
    if (total_dsize < 50000) {
/*        return smpi_coll_tuned_allgatherv_intra_bruck(sbuf, scount, sdtype, 
                                                      rbuf, rcounts, rdispls, rdtype, 
                                                      comm, module);*/
    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
                                                      rbuf, rcounts, rdispls, rdtype, 
                                                      comm);

    } else {
        if (communicator_size % 2) {
            return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype, 
                                                         rbuf, rcounts, rdispls, rdtype, 
                                                         comm);
        } else {
            return  smpi_coll_tuned_allgatherv_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype, 
                                                                      comm);
        }
    }
}
/*****************************************************************************

 * Function: alltoall_pair_light_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
           with node i ^ j. Light barriers are inserted between
           communications in different phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoall_pair_light_barrier(void *send_buff, int send_count,
                                            MPI_Datatype send_type,
                                            void *recv_buff, int recv_count,
                                            MPI_Datatype recv_type,
                                            MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALL;     /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if((num_procs&(num_procs-1)))
    THROWF(arg_error, 0, "alltoall pair algorithm can't be used with a non-power-of-two number of processes!");

  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  smpi_mpi_sendrecv(send_ptr + rank * send_chunk, send_count, send_type, rank, tag,
               recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
               comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
                 dst, tag, recv_ptr + src * recv_chunk, recv_count,
                 recv_type, src, tag, comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                   &recv_sync, 1, MPI_CHAR, next_partner, tag, comm, &s);
    }
  }
  return MPI_SUCCESS;
}
Example #21
/*
 * Another recursive doubling type algorithm, but in this case
 * we go up the tree and back down the tree.  
 */
int smpi_coll_tuned_barrier_ompi_tree(MPI_Comm comm)
{
    int rank, size, depth;
    int jump, partner;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG(
                 "ompi_coll_tuned_barrier_ompi_tree %d", 
                 rank);

    /* Find the nearest power of 2 of the communicator size. */
    for(depth = 1; depth < size; depth <<= 1 );

    for (jump=1; jump<depth; jump<<=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_recv (NULL, 0, MPI_BYTE, partner, 
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            } else if (partner < rank) {
                smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            }
        }
    }
    
    depth>>=1;
    for (jump = depth; jump>0; jump>>=1) {
        partner = rank ^ jump;
        if (!(partner & (jump-1)) && partner < size) {
            if (partner > rank) {
                smpi_mpi_send (NULL, 0, MPI_BYTE, partner,
                                         COLL_TAG_BARRIER,
                                          comm);
            } else if (partner < rank) {
                smpi_mpi_recv (NULL, 0, MPI_BYTE, partner, 
                                         COLL_TAG_BARRIER, comm,
                                         MPI_STATUS_IGNORE);
            }
        }
    }

    return MPI_SUCCESS;
}
Example #22
// Allgather - Non-Topology-Specific Logical Ring (NTSLR) algorithm
int
smpi_coll_tuned_allgather_NTSLR(void *sbuf, int scount, MPI_Datatype stype,
                                void *rbuf, int rcount, MPI_Datatype rtype,
                                MPI_Comm comm)
{
  MPI_Aint rextent, sextent;
  MPI_Status status;
  int i, to, from, rank, size;
  int send_offset, recv_offset;
  int tag = COLL_TAG_ALLGATHER;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);

  // irregular case: fall back to the default MPI functions
  if (scount * sextent != rcount * rextent) {
    XBT_WARN("MPI_allgather_NTSLR uses default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;    
  }

  // topo non-specific
  to = (rank + 1) % size;
  from = (rank + size - 1) % size;

  //copy a single segment from sbuf to rbuf
  send_offset = rank * scount * sextent;

  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + send_offset, rcount, rtype, rank, tag,
               comm, &status);


  //start sending logical ring message
  int increment = scount * sextent;
  for (i = 0; i < size - 1; i++) {
    send_offset = ((rank - i + size) % size) * increment;
    recv_offset = ((rank - i - 1 + size) % size) * increment;
    smpi_mpi_sendrecv((char *) rbuf + send_offset, scount, stype, to, tag + i,
                 (char *) rbuf + recv_offset, rcount, rtype, from, tag + i,
                 comm, &status);
  }

  return MPI_SUCCESS;
}
Example #23
int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
{
    int i;
    int size = smpi_comm_size(comm);
    int rank = smpi_comm_rank(comm);

    /* All non-root send & receive zero-length message. */

    if (rank > 0) {
        smpi_mpi_send (NULL, 0, MPI_BYTE, 0, 
                                 COLL_TAG_BARRIER,
                                  comm);

        smpi_mpi_recv (NULL, 0, MPI_BYTE, 0, 
                                 COLL_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE);
    }

    /* The root collects and broadcasts the messages. */

    else {
        MPI_Request* requests;

        requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     COLL_TAG_BARRIER, comm
                                     );
        }
        smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );

        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
                                     COLL_TAG_BARRIER,
                                      comm
                                     );
        }
        smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
        free( requests );
    }

    /* All done */

    return MPI_SUCCESS;

}
Example #24
// Allgather - gather/bcast algorithm
int smpi_coll_tuned_allgatherv_GB(void *send_buff, int send_count,
                                  MPI_Datatype send_type, void *recv_buff,
                                  int *recv_counts, int *recv_disps, MPI_Datatype recv_type,
                                  MPI_Comm comm)
{
    smpi_mpi_gatherv(send_buff, send_count, send_type, recv_buff, recv_counts,
                     recv_disps, recv_type, 0, comm);
    int num_procs, i, current, max = 0;
    num_procs = smpi_comm_size(comm);
    for (i = 0; i < num_procs; i++) {
        current = recv_disps[i] + recv_counts[i];
        if (current > max)
            max = current;
    }
    mpi_coll_bcast_fun(recv_buff, max, recv_type, 0, comm);

    return MPI_SUCCESS;
}
Example #25
int smpi_coll_tuned_reduce_ompi_pipeline( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm  )
{

    uint32_t segsize;
    int segcount = count;
    size_t typelng;
//    COLL_TUNED_UPDATE_PIPELINE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a4 =  0.0033 / 1024.0; /* [1/B] */
    const double b4 =  1.6761;
    typelng= smpi_datatype_size( datatype);
    int communicator_size = smpi_comm_size(comm);
    size_t message_size = typelng * count; 

    if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K 
        segsize = 1024;
    }else if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K 
        segsize = 32*1024;
    } else {
        // Pipeline_64K 
        segsize = 64*1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_pipeline rank %d ss %5d",
                 smpi_comm_rank(comm), segsize);

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm,
                                           ompi_coll_tuned_topo_build_chain( 1, comm, root), 
                                           segcount, 0);
}
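The two linear models above (a2/b2 and a4/b4) carve the (message size, communicator size) plane into three regions with 1 KiB, 32 KiB, and 64 KiB segments. A standalone sketch (hypothetical) that evaluates the same selection:

/* Replay of the segment-size selection above for hypothetical inputs. */
#include <stdio.h>
#include <stddef.h>

static unsigned int pipeline_segsize(int comm_size, size_t message_size)
{
  const double a2 = 0.0410 / 1024.0, b2 = 9.7128; /* same constants as above */
  const double a4 = 0.0033 / 1024.0, b4 = 1.6761;
  if (comm_size > a2 * message_size + b2)
    return 1024;            /* Pipeline_1K  */
  else if (comm_size > a4 * message_size + b4)
    return 32 * 1024;       /* Pipeline_32K */
  return 64 * 1024;         /* Pipeline_64K */
}

int main(void)
{
  printf("%u\n", pipeline_segsize(128, 1 << 20)); /* 1 MiB, 128 ranks -> 1024  */
  printf("%u\n", pipeline_segsize(8, 16 << 20));  /* 16 MiB, 8 ranks -> 65536 */
  return 0;
}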
Example #26
int smpi_coll_tuned_scatter_ompi(void *sbuf, int scount, 
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount, 
                                            MPI_Datatype rdtype, 
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_scatter_ompi");

    communicator_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    // Determine block size 
    if (root == rank) {
        dsize=smpi_datatype_size(sdtype);
        block_size = dsize * scount;
    } else {
        dsize=smpi_datatype_size(rdtype);
        block_size = dsize * rcount;
    } 

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
        if(rank!=root){
            sbuf=xbt_malloc(rcount*smpi_datatype_get_extent(rdtype));
            scount=rcount;
            sdtype=rdtype;
        }
        int ret=smpi_coll_tuned_scatter_ompi_binomial (sbuf, scount, sdtype,
            rbuf, rcount, rdtype,
            root, comm);
        if(rank!=root){
            xbt_free(sbuf);
        }
        return ret;
    }
    return smpi_coll_tuned_scatter_ompi_basic_linear (sbuf, scount, sdtype, 
                                                       rbuf, rcount, rdtype, 
                                                       root, comm);
}
Example #27
int smpi_coll_tuned_barrier_ompi(MPI_Comm  comm)
{
    int communicator_size = smpi_comm_size(comm);

    if( 2 == communicator_size )
        return smpi_coll_tuned_barrier_ompi_two_procs(comm);
    /* Basic optimisation. If we have a power of 2 number of nodes
     * then use the recursive doubling algorithm, otherwise
     * bruck is the one we want. */
    {
        int has_one = 0;
        for( ; communicator_size > 0; communicator_size >>= 1 ) {
            if( communicator_size & 0x1 ) {
                if( has_one )
                    return smpi_coll_tuned_barrier_ompi_bruck(comm);
                has_one = 1;
            }
        }
    }
    return smpi_coll_tuned_barrier_ompi_recursivedoubling(comm);
}
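The has_one loop above is a bit-counting power-of-two test: recursive doubling is chosen only when exactly one bit of communicator_size is set. An equivalent standalone check (hypothetical sketch):

/* The loop above picks recursive doubling iff communicator_size is a power
 * of two; the usual bit trick below is equivalent. */
#include <stdio.h>

static int is_power_of_two(int n)
{
  return n > 0 && (n & (n - 1)) == 0;
}

int main(void)
{
  for (int n = 3; n <= 9; n++)
    printf("%d ranks -> %s\n", n,
           is_power_of_two(n) ? "recursive doubling" : "bruck");
  return 0;
}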
Example #28
int smpi_coll_tuned_allreduce_ompi(void *sbuf, void *rbuf, int count,
                        MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
    size_t dsize, block_dsize;
    int comm_size = smpi_comm_size(comm);
    const size_t intermediate_message = 10000;

    /**
     * Decision function based on MX results from the Grig cluster at UTK.
     * 
     * Currently, linear, recursive doubling, and nonoverlapping algorithms 
     * can handle both commutative and non-commutative operations.
     * Ring algorithm does not support non-commutative operations.
     */
    dsize = smpi_datatype_size(dtype);
    block_dsize = dsize * count;

    if (block_dsize < intermediate_message) {
        return (smpi_coll_tuned_allreduce_rdb (sbuf, rbuf, 
                                                                   count, dtype,
                                                                   op, comm));
    } 

    if( smpi_op_is_commute(op) && (count > comm_size) ) {
        const size_t segment_size = 1 << 20; /* 1 MB */
        if ((comm_size * segment_size >= block_dsize)) {
            //FIXME: ok, these are not the right algorithms, try to find closer ones
            // lr is a good match for allreduce_ring (difference is mainly the use of sendrecv)
            return smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype,
                                              op, comm);
        } else {
           return (smpi_coll_tuned_allreduce_ompi_ring_segmented (sbuf, rbuf,
                                                                    count, dtype, 
                                                                    op, comm 
                                                                    /*segment_size*/));
        }
    }

    return (smpi_coll_tuned_allreduce_redbcast(sbuf, rbuf, count, 
                                                            dtype, op, comm));
}
Example #29
int smpi_coll_tuned_reduce_ompi_binomial( void *sendbuf, void *recvbuf,
                                           int count, MPI_Datatype datatype,
                                           MPI_Op  op, int root,
                                           MPI_Comm  comm)
{

    uint32_t segsize=0;
    int segcount = count;
    size_t typelng;

    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;

//    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng= smpi_datatype_size( datatype);
    int communicator_size = smpi_comm_size(comm);
    size_t message_size = typelng * count; 
    if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        segsize = 0;
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K 
        segsize = 1024;
    }

    XBT_DEBUG("coll:tuned:reduce_intra_binomial rank %d ss %5d",
                 smpi_comm_rank(comm), segsize);
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    return smpi_coll_tuned_ompi_reduce_generic( sendbuf, recvbuf, count, datatype, 
                                           op, root, comm, 
                                           ompi_coll_tuned_topo_build_in_order_bmtree(comm, root), 
                                           segcount, 0);
}
/*****************************************************************************

 * Function: alltoallv_pair_light_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_counts: array with the number of elements to send to each process
    send_disps: array of displacements into send_buff
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_counts: array with the number of elements to receive from each process
    recv_disps: array of displacements into recv_buff
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
           with node i ^ j. Light barriers are inserted between
           communications in different phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoallv_pair_light_barrier(void *send_buff, int *send_counts, int *send_disps,
                                            MPI_Datatype send_type,
                                            void *recv_buff, int *recv_counts, int *recv_disps,
                                            MPI_Datatype recv_type,
                                            MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALLV;     /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  smpi_mpi_sendrecv(send_ptr + send_disps[rank] * send_chunk, send_counts[rank], send_type, rank, tag,
               recv_ptr + recv_disps[rank] * recv_chunk, recv_counts[rank], recv_type, rank, tag,
               comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type,
                 dst, tag, recv_ptr + recv_disps[src] *recv_chunk, recv_counts[dst],
                 recv_type, src, tag, comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                   &recv_sync, 1, MPI_CHAR, next_partner, tag, comm, &s);
    }
  }
  return MPI_SUCCESS;
}