Example #1
/*****************************************************************************
 * Function: allgather_ring
 * return: int
 * inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j + i.
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgather_ring(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int recv_count, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  int i, src, dst, rank, num_procs;
  int tag = 1;
  MPI_Status status;

  char *sendptr = (char *) send_buff;
  char *recvptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  // local send/recv
  smpi_mpi_sendrecv(sendptr, send_count, send_type, rank, tag,
               recvptr + rank * recv_count * extent,
               recv_count, recv_type, rank, tag, comm, &status);

  for (i = 1; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;
    smpi_mpi_sendrecv(sendptr, send_count, send_type, dst, tag,
                 recvptr + src * recv_count * extent, recv_count, recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
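The step pattern described in the header ("in step i, node j receives from node j - i and sends to node j + i") can be checked in isolation. A minimal stand-alone sketch (plain C, no MPI; the process count and rank below are assumed values chosen for illustration):

#include <stdio.h>

/* Illustration only: print the ring schedule used by allgather_ring for one rank. */
int main(void)
{
  const int num_procs = 6;   /* assumed number of processes */
  const int rank = 2;        /* assumed rank to inspect */
  int i;

  for (i = 1; i < num_procs; i++) {
    int src = (rank - i + num_procs) % num_procs;  /* whose block arrives in step i */
    int dst = (rank + i) % num_procs;              /* peer that receives our block */
    printf("step %d: recv block of rank %d, send own block to rank %d\n", i, src, dst);
  }
  return 0;
}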
Example #2
int smpi_coll_tuned_barrier_mvapich2_pair(MPI_Comm comm)
{

    int size, rank;
    int d, dst, src;
    int mpi_errno = MPI_SUCCESS;

    size = smpi_comm_size(comm);
    /* Trivial barriers return immediately */
    if (size == 1)
        return MPI_SUCCESS;

    rank =  smpi_comm_rank(comm);
    int N2_prev = 1;
    /*  N2_prev = greatest power of two <= size of Comm  */
    for( N2_prev = 1; N2_prev <= size; N2_prev <<= 1 );
    N2_prev >>= 1;
    
    int surfeit = size - N2_prev;

    /* Perform a combine-like operation */
    if (rank < N2_prev) {
        if (rank < surfeit) {
            /* get the fanin letter from the upper "half" process: */
            dst = N2_prev + rank;
            smpi_mpi_recv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                                     comm, MPI_STATUS_IGNORE);
        }

        /* combine on embedded N2_prev power-of-two processes */
        for (d = 1; d < N2_prev; d <<= 1) {
            dst = (rank ^ d);
            smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER, NULL,
                                 0, MPI_BYTE, dst, COLL_TAG_BARRIER, comm,
                                 MPI_STATUS_IGNORE);
        }

        /* fanout data to nodes above N2_prev... */
        if (rank < surfeit) {
            dst = N2_prev + rank;
            smpi_mpi_send(NULL, 0, MPI_BYTE, dst, COLL_TAG_BARRIER,
                                     comm);
        }
    } else {
        /* fanin data to power of 2 subset */
        src = rank - N2_prev;
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                                     NULL, 0, MPI_BYTE, src, COLL_TAG_BARRIER,
                                     comm, MPI_STATUS_IGNORE);
    }

    return mpi_errno;

}
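The only arithmetic in this barrier is the N2_prev/surfeit split. A minimal stand-alone check of that computation (illustration only; the communicator size is an assumed value):

#include <stdio.h>

int main(void)
{
  int size = 11;               /* assumed communicator size */
  int N2_prev;

  for (N2_prev = 1; N2_prev <= size; N2_prev <<= 1)
    ;                          /* first power of two greater than size */
  N2_prev >>= 1;               /* greatest power of two <= size */

  printf("size = %d, N2_prev = %d, surfeit = %d\n", size, N2_prev, size - N2_prev);
  return 0;
}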
Example #3
/*****************************************************************************

 * Function: alltoall_ring

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 steps. In step i, node j - i -> j -> j + i.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoall_ring(void *send_buff, int send_count,
                              MPI_Datatype send_type, void *recv_buff,
                              int recv_count, MPI_Datatype recv_type,
                              MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALL;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  for (i = 0; i < num_procs; i++) {
    src = (rank - i + num_procs) % num_procs;
    dst = (rank + i) % num_procs;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
                 tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
                 src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
Example #4
int smpi_coll_tuned_barrier_ompi_bruck(MPI_Comm comm)
{
    int rank, size;
    int distance, to, from;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_bruck rank %d", rank);

    /* exchange data with rank-2^k and rank+2^k */
    for (distance = 1; distance < size; distance <<= 1) { 
        from = (rank + size - distance) % size;
        to   = (rank + distance) % size;

        /* exchange zero-byte messages with both neighbours at this distance */
        smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, to, 
                                              COLL_TAG_BARRIER,
                                              NULL, 0, MPI_BYTE, from, 
                                              COLL_TAG_BARRIER,
                                              comm, MPI_STATUS_IGNORE);
    }

    return MPI_SUCCESS;

}
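The dissemination pattern above runs ceil(log2(size)) rounds with doubling distances, so every process hears (transitively) from every other one. A stand-alone sketch that prints the partners of one rank (illustration only; size and rank are assumed values):

#include <stdio.h>

int main(void)
{
  const int size = 6;    /* assumed communicator size */
  const int rank = 0;    /* assumed rank to inspect */
  int distance;

  for (distance = 1; distance < size; distance <<= 1) {
    int from = (rank + size - distance) % size;
    int to   = (rank + distance) % size;
    printf("distance %d: recv from rank %d, send to rank %d\n", distance, from, to);
  }
  return 0;
}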
Example #5
/*****************************************************************************

 * Function: alltoallv_pair_mpi_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works when P is power of two. In each phase of P - 1
           phases, nodes in pair communicate their data. MPI barriers are
           inserted between each two phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoallv_pair_mpi_barrier(void *send_buff, int *send_counts, int *send_disps,
                                          MPI_Datatype send_type,
                                          void *recv_buff, int *recv_counts, int *recv_disps,
                                          MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = 101;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    smpi_mpi_barrier(comm);
    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst,
                 tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type,
                 src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
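All the pair algorithms in this listing build their schedule as src = dst = rank ^ i, which pairs every process with exactly one partner per phase only when the process count is a power of two. A stand-alone sketch of the pairing (illustration only; the process count is an assumed power of two):

#include <stdio.h>

int main(void)
{
  const int num_procs = 8;     /* assumed power-of-two process count */
  int i, rank;

  for (i = 1; i < num_procs; i++) {
    printf("phase %d:", i);
    for (rank = 0; rank < num_procs; rank++)
      printf(" %d<->%d", rank, rank ^ i);   /* each rank exchanges with rank ^ i */
    printf("\n");
  }
  return 0;
}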
Example #6
// Allgather: Non-Topology-Specific Logical Ring algorithm (non-blocking variant)
int
smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
                                   void *rbuf, int rcount, MPI_Datatype rtype,
                                   MPI_Comm comm)
{
    MPI_Aint rextent, sextent;
    MPI_Status status, status2;
    int i, to, from, rank, size;
    int send_offset, recv_offset;
    int tag = COLL_TAG_ALLGATHER;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    rextent = smpi_datatype_get_extent(rtype);
    sextent = smpi_datatype_get_extent(stype);
    // irregular case: fall back to the default MPI_Allgather implementation
    if (scount * sextent != rcount * rextent) {
        XBT_WARN("MPI_allgather_NTSLR_NB use default MPI_allgather.");
        smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
        return MPI_SUCCESS;
    }

    // allocate the request arrays only after the early-return check above,
    // so that the irregular case does not leak them
    MPI_Request *rrequest_array =
        (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));
    MPI_Request *srequest_array =
        (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));

    // topo non-specific
    to = (rank + 1) % size;
    from = (rank + size - 1) % size;

    //copy a single segment from sbuf to rbuf
    send_offset = rank * scount * sextent;

    smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                      (char *)rbuf + send_offset, rcount, rtype, rank, tag, comm, &status);


    //start sending logical ring message
    int increment = scount * sextent;

    //post all irecv first
    for (i = 0; i < size - 1; i++) {
        recv_offset = ((rank - i - 1 + size) % size) * increment;
        rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount, rtype, from, tag + i, comm);
    }


    for (i = 0; i < size - 1; i++) {
        send_offset = ((rank - i + size) % size) * increment;
        srequest_array[i] = smpi_mpi_isend((char *)rbuf + send_offset, scount, stype, to, tag + i, comm);
        smpi_mpi_wait(&rrequest_array[i], &status);
        smpi_mpi_wait(&srequest_array[i], &status2);
    }

    free(rrequest_array);
    free(srequest_array);

    return MPI_SUCCESS;
}
/*****************************************************************************

 * Function: alltoall_pair_light_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
           with node i ^ j. Light barriers are inserted between
           communications in different phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoall_pair_light_barrier(void *send_buff, int send_count,
                                            MPI_Datatype send_type,
                                            void *recv_buff, int recv_count,
                                            MPI_Datatype recv_type,
                                            MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALL;     /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if((num_procs&(num_procs-1)))
    THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! ");

  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  send_chunk *= send_count;
  recv_chunk *= recv_count;

  smpi_mpi_sendrecv(send_ptr + rank * send_chunk, send_count, send_type, rank, tag,
               recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
               comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type,
                 dst, tag, recv_ptr + src * recv_chunk, recv_count,
                 recv_type, src, tag, comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                   &recv_sync, 1, MPI_CHAR, next_partner, tag, comm, &s);
    }
  }
  return MPI_SUCCESS;
}
Example #8
// Allgather: Non-Topology-Specific Logical Ring algorithm
int
smpi_coll_tuned_allgather_NTSLR(void *sbuf, int scount, MPI_Datatype stype,
                                void *rbuf, int rcount, MPI_Datatype rtype,
                                MPI_Comm comm)
{
  MPI_Aint rextent, sextent;
  MPI_Status status;
  int i, to, from, rank, size;
  int send_offset, recv_offset;
  int tag = COLL_TAG_ALLGATHER;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);

  // irregular case: use the default MPI functions
  if (scount * sextent != rcount * rextent) {
    XBT_WARN("MPI_allgather_NTSLR use default MPI_allgather.");  
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;    
  }

  // topo non-specific
  to = (rank + 1) % size;
  from = (rank + size - 1) % size;

  //copy a single segment from sbuf to rbuf
  send_offset = rank * scount * sextent;

  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + send_offset, rcount, rtype, rank, tag,
               comm, &status);


  //start sending logical ring message
  int increment = scount * sextent;
  for (i = 0; i < size - 1; i++) {
    send_offset = ((rank - i + size) % size) * increment;
    recv_offset = ((rank - i - 1 + size) % size) * increment;
    smpi_mpi_sendrecv((char *) rbuf + send_offset, scount, stype, to, tag + i,
                 (char *) rbuf + recv_offset, rcount, rtype, from, tag + i,
                 comm, &status);
  }

  return MPI_SUCCESS;
}
Example #9
int
smpi_coll_tuned_reduce_flat_tree(void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = 4321;
  int size;
  int rank;
  MPI_Aint extent;
  char *origin = 0;
  char *inbuf;
  MPI_Status status;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* If not root, send data to the root. */
  extent = smpi_datatype_get_extent(dtype);

  if (rank != root) {
    smpi_mpi_send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = (char *) xbt_malloc(count * extent);


  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    smpi_mpi_sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    smpi_mpi_recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = sbuf;
    else {
      smpi_mpi_recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    smpi_op_apply(op, inbuf, rbuf, &count, &dtype);

  }

  if (origin)
    free(origin);

  /* All done */
  return 0;
}
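For reference, what this flat-tree implementation computes is the semantics of a standard MPI_Reduce: every rank contributes a buffer and only the root ends up with the combined result. A minimal standard-MPI usage sketch (plain MPI, not SMPI internals; the contribution value is made up for illustration):

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
  int rank, value, sum = 0;

  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  value = rank + 1;                              /* each rank contributes rank + 1 */
  MPI_Reduce(&value, &sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

  if (rank == 0)
    printf("sum of contributions = %d\n", sum);  /* only the root holds the result */

  MPI_Finalize();
  return 0;
}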
Example #10
/*****************************************************************************

 * Function: alltoallv_pair_light_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works in P - 1 steps. In step i, node j exchanges data
           with node i ^ j. Light barriers are inserted between
           communications in different phases.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoallv_pair_light_barrier(void *send_buff, int *send_counts, int *send_disps,
                                            MPI_Datatype send_type,
                                            void *recv_buff, int *recv_counts, int *recv_disps,
                                            MPI_Datatype recv_type,
                                            MPI_Comm comm)
{
  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs, next_partner;
  int tag = COLL_TAG_ALLTOALLV;     /*, failure = 0; */

  char send_sync = 'a', recv_sync = 'b';
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  smpi_mpi_sendrecv(send_ptr + send_disps[rank] * send_chunk, send_counts[rank], send_type, rank, tag,
               recv_ptr + recv_disps[rank] * recv_chunk, recv_counts[rank], recv_type, rank, tag,
               comm, &s);

  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;

    smpi_mpi_sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type,
                 dst, tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src],
                 recv_type, src, tag, comm, &s);

    if ((i + 1) < num_procs) {
      next_partner = rank ^ (i + 1);
      smpi_mpi_sendrecv(&send_sync, 1, MPI_CHAR, next_partner, tag,
                   &recv_sync, 1, MPI_CHAR, next_partner, tag, comm, &s);
    }
  }
  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: allgather_spreading_simple
 * return: int
 *  inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Let i -> j denote the communication from node i to node j. The
 *         order of communications for node i is i -> i + 1, i -> i + 2, ...,
 *         i -> (i + P - 1) % P.
 *
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgather_spreading_simple(void *send_buff, int send_count,
                                           MPI_Datatype send_type,
                                           void *recv_buff, int recv_count,
                                           MPI_Datatype recv_type,
                                           MPI_Comm comm)
{
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;
  int i, src, dst, rank, num_procs, num_reqs;
  int tag = COLL_TAG_ALLGATHER;
  MPI_Status status;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  num_reqs = (2 * num_procs) - 2;
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));
  if (!reqs) {
    printf("allgather-spreading-simple.c:40: cannot allocate memory\n");
    MPI_Finalize();
    exit(0);
  }

  req_ptr = reqs;
  smpi_mpi_sendrecv(send_buff, send_count, send_type, rank, tag,
               (char *) recv_buff + rank * recv_count * extent, recv_count,
               recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++) {
    src = (rank + i) % num_procs;
    if (src == rank)
      continue;
    *(req_ptr++) = smpi_mpi_irecv(recv_ptr + src * recv_count * extent, recv_count, recv_type,
              src, tag, comm);
  }

  for (i = 0; i < num_procs; i++) {
    dst = (rank + i) % num_procs;
    if (dst == rank)
      continue;
    *(req_ptr++) = smpi_mpi_isend(send_buff, send_count, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
  free(reqs);

  return MPI_SUCCESS;
}
Example #12
/*****************************************************************************
 * Function: allgatherv_pair
 * return: int
 *  inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Function works when P is power of two. In each phase of P - 1
 *         phases, nodes in pair communicate their data.
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgatherv_pair(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int *recv_counts, int *recv_disps, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLGATHERV;
  MPI_Status status;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);

  if((num_procs&(num_procs-1)))
    THROWF(arg_error,0, "allgatherv pair algorithm can't be used with non power of two number of processes ! ");

  extent = smpi_datatype_get_extent(send_type);

  // local send/recv
  smpi_mpi_sendrecv(send_ptr, send_count, send_type, rank, tag,
               recv_ptr + recv_disps[rank] * extent,
               recv_counts[rank], recv_type, rank, tag, comm, &status);
  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;
    smpi_mpi_sendrecv(send_ptr, send_count, send_type, dst, tag,
                 recv_ptr + recv_disps[src] * extent, recv_counts[src], recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
Example #13
/* special case for two processes */
int smpi_coll_tuned_barrier_ompi_two_procs(MPI_Comm comm)
{
    int remote;

    remote = smpi_comm_rank(comm);
    XBT_DEBUG("ompi_coll_tuned_barrier_ompi_two_procs rank %d", remote);
    remote = (remote + 1) & 0x1;

    smpi_mpi_sendrecv(NULL, 0, MPI_BYTE, remote, 
                                          COLL_TAG_BARRIER,
                                          NULL, 0, MPI_BYTE, remote, 
                                          COLL_TAG_BARRIER,
                                          comm, MPI_STATUS_IGNORE);
    return (MPI_SUCCESS);
}
Example #14
/*****************************************************************************

 * Function: alltoall_pair_one_barrier

 * Return: int

 * Inputs:
    send_buff: send input buffer
    send_count: number of elements to send
    send_type: data type of elements being sent
    recv_buff: receive output buffer
    recv_count: number of elements to receive
    recv_type: data type of elements being received
    comm: communicator

 * Descrp: Function works when P is power of two. In each phase of P - 1
           phases, nodes in pair communicate their data. A single barrier is
           issued before the exchange loop.

 * Author: Ahmad Faraj

 ****************************************************************************/
int
smpi_coll_tuned_alltoall_pair_one_barrier(void *send_buff, int send_count,
        MPI_Datatype send_type,
        void *recv_buff, int recv_count,
        MPI_Datatype recv_type, MPI_Comm comm)
{

    MPI_Aint send_chunk, recv_chunk;
    MPI_Status s;
    int i, src, dst, rank, num_procs;
    int tag = COLL_TAG_ALLTOALL;

    char *send_ptr = (char *) send_buff;
    char *recv_ptr = (char *) recv_buff;

    rank = smpi_comm_rank(comm);
    num_procs = smpi_comm_size(comm);

    if((num_procs&(num_procs-1)))
        THROWF(arg_error,0, "alltoall pair algorithm can't be used with non power of two number of processes ! ");

    send_chunk = smpi_datatype_get_extent(send_type);
    recv_chunk = smpi_datatype_get_extent(recv_type);

    send_chunk *= send_count;
    recv_chunk *= recv_count;

    mpi_coll_barrier_fun(comm);
    for (i = 0; i < num_procs; i++) {
        src = dst = rank ^ i;
        smpi_mpi_sendrecv(send_ptr + dst * send_chunk, send_count, send_type, dst,
                          tag, recv_ptr + src * recv_chunk, recv_count, recv_type,
                          src, tag, comm, &s);
    }

    return MPI_SUCCESS;
}
int 
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                  MPI_Datatype sdtype,
                                                  void* rbuf, int *rcounts, int *rdispls,
                                                  MPI_Datatype rdtype,
                                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  
    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;


    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (size % 2) {
        XBT_DEBUG(
                     "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
                     size);
        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                                     rbuf, rcounts, 
                                                     rdispls, rdtype,
                                                     comm);
    }

    XBT_DEBUG(
                 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = smpi_datatype_extent (sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to 
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype, 
                              tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
    } 

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps: 
       update recv_data_from according to offset, and 
       exchange two blocks with appropriate neighbor.
       the send location becomes the previous receive location.
       Note, we need to create indexed datatype to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   tmprecv, rcounts[neighbor[0]], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   comm, MPI_STATUS_IGNORE);



  
   
    /* Determine initial sending counts and displacements*/
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] = 
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1)
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */
        
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, 
                                      &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, 
                                      &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);
      
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;
      
        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];
      
        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank);
    return err;
}
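The neighbor/parity bookkeeping above is easiest to see on a small even communicator: even ranks first talk to rank + 1 and then walk blocks with offsets +2/-2, while odd ranks mirror this. A stand-alone sketch (illustration only, no MPI; the communicator size is an assumed even value):

#include <stdio.h>

int main(void)
{
  const int size = 6;          /* assumed even communicator size */
  int rank;

  for (rank = 0; rank < size; rank++) {
    int even_rank = !(rank % 2);
    int n0 = even_rank ? (rank + 1) % size : (rank - 1 + size) % size;
    int n1 = even_rank ? (rank - 1 + size) % size : (rank + 1) % size;
    int off0 = even_rank ? +2 : -2;
    int off1 = even_rank ? -2 : +2;
    printf("rank %d: neighbor[0]=%d neighbor[1]=%d offsets=(%+d,%+d); "
           "first step: send own block to %d, recv its block\n",
           rank, n0, n1, off0, off1, n0);
  }
  return 0;
}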
Example #16
int smpi_coll_tuned_allreduce_rdb(void *sbuff, void *rbuff, int count,
                                  MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  int nprocs, rank, tag = COLL_TAG_ALLREDUCE;
  int mask, dst, pof2, newrank, rem, newdst;
  MPI_Aint extent, lb;
  MPI_Status status;
  void *tmp_buf = NULL;
  /*
     #ifdef MPICH2_REDUCTION
     MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
     #else
     MPI_User_function *uop;
     struct MPIR_OP *op_ptr;
     op_ptr = MPIR_ToPointer(op);
     uop  = op_ptr->op;
     #endif
   */
  nprocs=smpi_comm_size(comm);
  rank=smpi_comm_rank(comm);

  smpi_datatype_extent(dtype, &lb, &extent);
  tmp_buf = (void *) xbt_malloc(count * extent);

  smpi_mpi_sendrecv(sbuff, count, dtype, rank, 500,
               rbuff, count, dtype, rank, 500, comm, &status);

  // find nearest power-of-two less than or equal to comm_size
  pof2 = 1;
  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;

  rem = nprocs - pof2;

  // In the non-power-of-two case, all even-numbered
  // processes of rank < 2*rem send their data to
  // (rank+1). These even-numbered processes no longer
  // participate in the algorithm until the very end. The
  // remaining processes form a nice power-of-two. 

  if (rank < 2 * rem) {
    // even       
    if (rank % 2 == 0) {

      smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm);

      // temporarily set the rank to -1 so that this
      // process does not participate in recursive
      // doubling
      newrank = -1;
    } else                      // odd
    {
      smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
      // do the reduction on received data. since the
      // ordering is right, it doesn't matter whether
      // the operation is commutative or not.
      smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype);

      // change the rank 
      newrank = rank / 2;
    }
  }

  else                          // rank >= 2 * rem 
    newrank = rank - rem;

  // If op is user-defined or count is less than pof2, use
  // recursive doubling algorithm. Otherwise do a reduce-scatter
  // followed by allgather. (If op is user-defined,
  // derived datatypes are allowed and the user could pass basic
  // datatypes on one process and derived on another as long as
  // the type maps are the same. Breaking up derived
  // datatypes to do the reduce-scatter is tricky, therefore
  // using recursive doubling in that case.) 

  if (newrank != -1) {
    mask = 0x1;
    while (mask < pof2) {
      newdst = newrank ^ mask;
      // find real rank of dest 
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      // Send the most current data, which is in recvbuf. Recv
      // into tmp_buf 
      smpi_mpi_sendrecv(rbuff, count, dtype, dst, tag, tmp_buf, count, dtype,
                   dst, tag, comm, &status);

      // tmp_buf contains data received in this step.
      // recvbuf contains data accumulated so far 

      // op is commutative OR the order is already right
      // we assume it is a commutative op
      //      if (op -> op_commute  || (dst < rank))
      if ((dst < rank)) {
        smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype);
      } else                    // op is noncommutative and the order is not right
      {
        smpi_op_apply(op, rbuff, tmp_buf, &count, &dtype);

        // copy result back into recvbuf
        smpi_mpi_sendrecv(tmp_buf, count, dtype, rank, tag, rbuff, count,
                     dtype, rank, tag, comm, &status);
      }
      mask <<= 1;
    }
  }
  // In the non-power-of-two case, all odd-numbered processes of
  // rank < 2 * rem send the result to (rank-1), the ranks who didn't
  // participate above.

  if (rank < 2 * rem) {
    if (rank % 2)               // odd 
      smpi_mpi_send(rbuff, count, dtype, rank - 1, tag, comm);
    else                        // even 
      smpi_mpi_recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
  }

  free(tmp_buf);
  return MPI_SUCCESS;
}
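The non-power-of-two handling above folds the first 2*rem ranks into rem survivors (odd ranks keep going, even ranks sit out) and then runs recursive doubling on a clean power of two. A stand-alone sketch of the rank remapping (illustration only; the process count is an assumed non-power-of-two value):

#include <stdio.h>

int main(void)
{
  const int nprocs = 6;        /* assumed (non-power-of-two) process count */
  int pof2 = 1, rank;

  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;                  /* largest power of two <= nprocs */
  int rem = nprocs - pof2;

  for (rank = 0; rank < nprocs; rank++) {
    int newrank = (rank < 2 * rem) ? ((rank % 2) ? rank / 2 : -1)
                                   : rank - rem;
    printf("rank %d -> newrank %d%s\n", rank, newrank,
           newrank == -1 ? " (idle during the doubling phase)" : "");
  }
  return 0;
}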
/* Non-topology-specific pipelined linear-reduce function */
int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 MPI_Op op, int root,
                                                 MPI_Comm comm)
{
  int rank;
  rank = smpi_comm_rank(comm);

  int tag = -COLL_TAG_REDUCE;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Status temp_status_array[MAX_NODE];

  int size;
  int i;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_received[MAX_NODE];

  int header_buf[HEADER_SIZE];
  char temp_buf[MAX_NODE];

  MPI_Aint extent, lb;
  smpi_datatype_extent(datatype, &lb, &extent);

  /* source and destination */
  int to, from;

  size=smpi_comm_size(comm);
  rank=smpi_comm_rank(comm);


  /* segment is segment size in number of elements (not bytes) */
  int segment = reduce_arrival_pattern_aware_segment_size_in_byte / extent;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size => 
     the small remainder will be done with native implementation */
  int remainder = count % segment;


  /* value == 0 means root has not send data (or header) to the node yet */
  for (i = 0; i < MAX_NODE; i++) {
    already_received[i] = 0;
  }

  char *tmp_buf;
  tmp_buf = (char *) xbt_malloc(count * extent);

  smpi_mpi_sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank,
               tag, comm, &status);



  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {

    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {

        for (i = 1; i < size; i++) {
          if (already_received[i] == 0) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                             MPI_STATUSES_IGNORE);
            simcall_process_sleep(0.0001);
            }
        }

        header_index = 0;
        /* recv 1-byte message */
        for (i = 0; i < size; i++) {
          if (i == rank)
            continue;

          /* 1-byte message arrive */
          if ((flag_array[i] == 1) && (already_received[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;


            //printf("root send to %d recv from %d : data = ",to,from);
            /*
               for (i=0;i<=header_index;i++) {
               printf("%d ",header_buf[i]);
               }
               printf("\n");
             */
            /* will receive in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by receive and reduce data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          from = header_buf[header_index - 1];

          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_recv(tmp_buf, count, datatype, from, tag, comm, &status);
          smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        }
      }                         /* while loop */
    }

    /* root */
    /* non-root */
    else {

      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
               &status);
      //      smpi_mpi_recv(buf,count,datatype,MPI_ANY_SOURCE,tag,comm,&status);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* forward header */
      if (header_buf[myordering + 1] != -1) {
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }
      //printf("node %d ordering %d\n",rank,myordering);

      /* receive, reduce, and forward data */

      /* send only */
      if (myordering == 0) {
        if (header_buf[myordering + 1] == -1) {
          to = 0;
        } else {
          to = header_buf[myordering + 1];
        }
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }

      /* recv, reduce, send */
      else {
        if (header_buf[myordering + 1] == -1) {
          to = 0;
        } else {
          to = header_buf[myordering + 1];
        }
        from = header_buf[myordering - 1];
        smpi_mpi_recv(tmp_buf, count, datatype, header_buf[myordering - 1], tag,
                 comm, &status);
        smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }
    }                           /* non-root */
  }
  /* pipelined reduce (count > segment) */
  else {
    //    printf("node %d start\n",rank);

    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      sent_count = 0;

      int will_send[MAX_NODE];
      for (i = 0; i < MAX_NODE; i++)
        will_send[i] = 0;

      /* loop until all data are received (sent) */
      while (sent_count < (size - 1)) {
        int k;
        for (k = 0; k < 1; k++) {
          for (i = 1; i < size; i++) {
            //if (i == rank)
            //continue;
            if ((already_received[i] == 0) && (will_send[i] == 0)) {
                smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                         &temp_status_array[i]);
              if (flag_array[i] == 1) {
                will_send[i] = 1;
                smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD,
                         &status);
                //printf("recv from %d\n",i);
                i = 1;
              }
            }
          }
        }                       /* end of probing */

        header_index = 0;

        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          //if (i==rank)
          //continue;
          /* message arrived in this round (put in the header) */
          if ((will_send[i] == 1) && (already_received[i] == 0)) {
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          /* recv data - pipeline */
          from = header_buf[header_index - 1];
          for (i = 0; i < pipe_length; i++) {
            smpi_mpi_recv(tmp_buf + (i * increment), segment, datatype, from, tag,
                     comm, &status);
            smpi_op_apply(op, tmp_buf + (i * increment),
                           (char *)rbuf + (i * increment), &segment, &datatype);
          }
        }
      }                         /* while loop (sent_count < size-1 ) */
    }

    /* root */
    /* none root */
    else {
      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);


      /* wait for header forward when required */
      request=smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is */
      int myordering = 0;

      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }

      /* (receive, reduce), and send data */
      if (header_buf[myordering + 1] == -1) {
        to = 0;
      } else {
        to = header_buf[myordering + 1];
      }

      /* send only */
      if (myordering == 0) {
        for (i = 0; i < pipe_length; i++) {
            send_request_array[i]= smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }

      /* receive, reduce, and send */
      else {
        from = header_buf[myordering - 1];
        for (i = 0; i < pipe_length; i++) {
          recv_request_array[i]=smpi_mpi_irecv(tmp_buf + (i * increment), segment, datatype, from, tag, comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment),
                         &segment, &datatype);
          send_request_array[i]=smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }
    }                           /* non-root */




    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);

    //printf("node %d done\n",rank);
  }                             /* end pipeline */


  /* if root is not zero send root after finished
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == 0) {
      smpi_mpi_send(rbuf, count, datatype, root, tag, comm);
    } else if (rank == root) {
      smpi_mpi_recv(rbuf, count, datatype, 0, tag, comm, &status);
    }
  }


  /* when count is not divisible by block size, use the default reduce for the remainder */
  if ((remainder != 0) && (count > segment)) {
    smpi_mpi_reduce((char *)buf + (pipe_length * increment),
	       (char *)rbuf + (pipe_length * increment), remainder, datatype, op, root,
               comm);
  }

  free(tmp_buf);

  return MPI_SUCCESS;
}
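The pipelining above is driven by three derived quantities: segment (elements per block), pipe_length (number of full blocks) and remainder (left for a separate reduce). A stand-alone sketch of that arithmetic (illustration only; the segment size constant and datatype extent below are assumed values, while the real code takes them from the SMPI configuration and the datatype):

#include <stdio.h>

int main(void)
{
  const int segment_size_in_byte = 8192;  /* assumed segment size in bytes */
  const int extent = 8;                   /* assumed datatype extent, e.g. a double */
  const int count = 10000;                /* assumed element count */

  int segment = segment_size_in_byte / extent;   /* elements per pipeline block */
  int pipe_length = count / segment;             /* number of full blocks */
  int increment = segment * extent;              /* byte offset between blocks */
  int remainder = count % segment;               /* handled by a separate reduce */

  printf("segment=%d pipe_length=%d increment=%d remainder=%d\n",
         segment, pipe_length, increment, remainder);
  return 0;
}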
Example #18
int smpi_coll_tuned_reduce_scatter_gather(void *sendbuf, void *recvbuf,
                                          int count, MPI_Datatype datatype,
                                          MPI_Op op, int root, MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank, pof2, rem, newrank;
  int mask, *cnts, *disps, i, j, send_idx = 0;
  int recv_idx, last_idx = 0, newdst;
  int dst, send_cnt, recv_cnt, newroot, newdst_tree_root;
  int newroot_tree_root, new_count;
  int tag = COLL_TAG_REDUCE;
  void *send_ptr, *recv_ptr, *tmp_buf;

  cnts = NULL;
  disps = NULL;

  MPI_Aint extent;

  if (count == 0)
    return 0;
  rank = smpi_comm_rank(comm);
  comm_size = smpi_comm_size(comm);

  extent = smpi_datatype_get_extent(datatype);

  /* find nearest power-of-two less than or equal to comm_size */
  pof2 = 1;
  while (pof2 <= comm_size)
    pof2 <<= 1;
  pof2 >>= 1;

  if (count < comm_size) {
    new_count = comm_size;
    send_ptr = (void *) xbt_malloc(new_count * extent);
    recv_ptr = (void *) xbt_malloc(new_count * extent);
    tmp_buf = (void *) xbt_malloc(new_count * extent);
    memcpy(send_ptr, sendbuf, extent * count);

    //if ((rank != root))
    smpi_mpi_sendrecv(send_ptr, new_count, datatype, rank, tag,
                 recv_ptr, new_count, datatype, rank, tag, comm, &status);

    rem = comm_size - pof2;
    if (rank < 2 * rem) {
      if (rank % 2 != 0) {
        /* odd */
        smpi_mpi_send(recv_ptr, new_count, datatype, rank - 1, tag, comm);
        newrank = -1;
      } else {
        smpi_mpi_recv(tmp_buf, count, datatype, rank + 1, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_ptr, &new_count, &datatype);
        newrank = rank / 2;
      }
    } else                      /* rank >= 2*rem */
      newrank = rank - rem;

    cnts = (int *) xbt_malloc(pof2 * sizeof(int));
    disps = (int *) xbt_malloc(pof2 * sizeof(int));

    if (newrank != -1) {
      for (i = 0; i < (pof2 - 1); i++)
        cnts[i] = new_count / pof2;
      cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1);

      disps[0] = 0;
      for (i = 1; i < pof2; i++)
        disps[i] = disps[i - 1] + cnts[i - 1];

      mask = 0x1;
      send_idx = recv_idx = 0;
      last_idx = pof2;
      while (mask < pof2) {
        newdst = newrank ^ mask;
        /* find real rank of dest */
        dst = (newdst < rem) ? newdst * 2 : newdst + rem;

        send_cnt = recv_cnt = 0;
        if (newrank < newdst) {
          send_idx = recv_idx + pof2 / (mask * 2);
          for (i = send_idx; i < last_idx; i++)
            send_cnt += cnts[i];
          for (i = recv_idx; i < send_idx; i++)
            recv_cnt += cnts[i];
        } else {
          recv_idx = send_idx + pof2 / (mask * 2);
          for (i = send_idx; i < recv_idx; i++)
            send_cnt += cnts[i];
          for (i = recv_idx; i < last_idx; i++)
            recv_cnt += cnts[i];
        }

        /* Send data from recvbuf. Recv into tmp_buf */
        smpi_mpi_sendrecv((char *) recv_ptr +
                     disps[send_idx] * extent,
                     send_cnt, datatype,
                     dst, tag,
                     (char *) tmp_buf +
                     disps[recv_idx] * extent,
                     recv_cnt, datatype, dst, tag, comm, &status);

        /* tmp_buf contains data received in this step.
           recvbuf contains data accumulated so far */

        smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                       (char *) recv_ptr + disps[recv_idx] * extent,
                       &recv_cnt, &datatype);

        /* update send_idx for next iteration */
        send_idx = recv_idx;
        mask <<= 1;

        if (mask < pof2)
          last_idx = recv_idx + pof2 / mask;
      }
    }

    /* now do the gather to root */

    if (root < 2 * rem) {
      if (root % 2 != 0) {
        if (rank == root) {
          /* recv */
          for (i = 0; i < (pof2 - 1); i++)
            cnts[i] = new_count / pof2;
          cnts[pof2 - 1] = new_count - (new_count / pof2) * (pof2 - 1);

          disps[0] = 0;
          for (i = 1; i < pof2; i++)
            disps[i] = disps[i - 1] + cnts[i - 1];

          smpi_mpi_recv(recv_ptr, cnts[0], datatype, 0, tag, comm, &status);

          newrank = 0;
          send_idx = 0;
          last_idx = 2;
        } else if (newrank == 0) {
          smpi_mpi_send(recv_ptr, cnts[0], datatype, root, tag, comm);
          newrank = -1;
        }
        newroot = 0;
      } else
        newroot = root / 2;
    } else
      newroot = root - rem;

    if (newrank != -1) {
      j = 0;
      mask = 0x1;
      while (mask < pof2) {
        mask <<= 1;
        j++;
      }
      mask >>= 1;
      j--;
      while (mask > 0) {
        newdst = newrank ^ mask;

        /* find real rank of dest */
        dst = (newdst < rem) ? newdst * 2 : newdst + rem;

        if ((newdst == 0) && (root < 2 * rem) && (root % 2 != 0))
          dst = root;
        newdst_tree_root = newdst >> j;
        newdst_tree_root <<= j;

        newroot_tree_root = newroot >> j;
        newroot_tree_root <<= j;

        send_cnt = recv_cnt = 0;
        if (newrank < newdst) {
          /* update last_idx except on first iteration */
          if (mask != pof2 / 2)
            last_idx = last_idx + pof2 / (mask * 2);

          recv_idx = send_idx + pof2 / (mask * 2);
          for (i = send_idx; i < recv_idx; i++)
            send_cnt += cnts[i];
          for (i = recv_idx; i < last_idx; i++)
            recv_cnt += cnts[i];
        } else {
          recv_idx = send_idx - pof2 / (mask * 2);
          for (i = send_idx; i < last_idx; i++)
            send_cnt += cnts[i];
          for (i = recv_idx; i < send_idx; i++)
            recv_cnt += cnts[i];
        }

        if (newdst_tree_root == newroot_tree_root) {
          smpi_mpi_send((char *) recv_ptr +
                   disps[send_idx] * extent,
                   send_cnt, datatype, dst, tag, comm);
          break;
        } else {
          smpi_mpi_recv((char *) recv_ptr +
                   disps[recv_idx] * extent,
                   recv_cnt, datatype, dst, tag, comm, &status);
        }

        if (newrank > newdst)
          send_idx = recv_idx;

        mask >>= 1;
        j--;
      }
    }
int 
smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                               MPI_Datatype dtype,
                                               MPI_Op op,
                                               MPI_Comm comm) 
{
   int ret = MPI_SUCCESS;
   int line;
   int k, recv_from, send_to;
   int early_blockcount, late_blockcount, split_rank; 
   int segcount, max_segcount;
   int num_phases, phase;
   int block_count;
   unsigned int inbi;
   size_t typelng;
   char *tmpsend = NULL, *tmprecv = NULL;
   char *inbuf[2] = {NULL, NULL};
   ptrdiff_t true_extent, extent;
   ptrdiff_t block_offset, max_real_segsize;
   MPI_Request reqs[2] = {NULL, NULL};
   const size_t segsize = 1 << 20; /* 1 MB */
   unsigned int size = smpi_comm_size(comm);
   unsigned int rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

   /* Special case for size == 1 */
   if (1 == size) {
      if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
         if (ret < 0) { line = __LINE__; goto error_hndl; }
      }
      return MPI_SUCCESS;
   }
   
   /* Determine segment count based on the suggested segment size */
   extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   true_extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   typelng = smpi_datatype_size(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   segcount = count;
   COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

   /* Special case for count less than size * segcount - use regular ring */
   if (count < size * segcount) {
      XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
      return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, 
                                                   comm));
   }

   /* Determine the number of phases of the algorithm */
   num_phases = count / (size * segcount);
   if ((count % (size * segcount) >= size) && 
       (count % (size * segcount) > ((size * segcount) / 2))) {
      num_phases++;
   }

   /* Determine the number of elements per block and corresponding 
      block sizes.
      The blocks are divided into "early" and "late" ones:
      blocks 0 .. (split_rank - 1) are "early" and 
      blocks (split_rank) .. (size - 1) are "late".
      Early blocks are at most 1 element larger than the late ones.
      Note, these blocks will be split into num_phases segments,
      out of the largest one will have max_segcount elements.
    */
   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, 
                                  early_blockcount, late_blockcount )
   COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                  max_segcount, k)
   max_real_segsize = true_extent + (max_segcount - 1) * extent;

   /* Allocate and initialize temporary buffers */
   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
   if (size > 2) {
      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
   }

   /* Handle MPI_IN_PLACE */
   if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
   }

   /* Computation loop: for each phase, repeat ring allreduce computation loop */
   for (phase = 0; phase < num_phases; phase ++) {
      ptrdiff_t phase_offset;
      int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

      /* 
         For each of the remote nodes:
         - post irecv for block (r-1)
         - send block (r)
           To do this, first compute block offset and count, and use block offset
           to compute phase offset.
         - in loop for every step k = 2 .. n
           - post irecv for block (r + n - k) % n
           - wait on block (r + n - k + 1) % n to arrive
           - compute on block (r + n - k + 1) % n
           - send block (r + n - k + 1) % n
         - wait on block (r + 1)
         - compute on block (r + 1)
         - send block (r + 1) to rank (r + 1)
         Note that we must be careful when computing the beginning of buffers, and
         for send operations and computation we must compute the exact block size.
      */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
      
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
         - send data */
      block_offset = ((rank < split_rank)? 
                      (rank * early_blockcount) : 
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_mpi_send(tmpsend, phase_count, dtype, send_to,
                              666, comm);
      
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
         
         inbi = inbi ^ 0x1;
         
         /* Post irecv for the current block */
         reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
         /* smpi_mpi_irecv returns the request (stored in reqs[inbi]) rather
            than an error code, so there is nothing to check here */
         
         /* Wait on previous block to arrive */
         smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
         
         /* Apply operation on previous block: result goes to rbuf
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         block_offset = ((prevblock < split_rank)?
                         (prevblock * early_blockcount) :
                         (prevblock * late_blockcount + split_rank));
         block_count = ((prevblock < split_rank)? 
                        early_blockcount : late_blockcount);
         COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount)
         phase_count = ((phase < split_phase)?
                        (early_phase_segcount) : (late_phase_segcount));
         phase_offset = ((phase < split_phase)?
                         (phase * early_phase_segcount) : 
                         (phase * late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
         smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);
         /* send previous block to send_to */
         smpi_mpi_send(tmprecv, phase_count, dtype, send_to,
                              666, comm);
      }
      
      /* Wait on the last block to arrive */
      smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

      
      /* Apply operation on the last block (from neighbor (rank + 1)):
         rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
      recv_from = (rank + 1) % size;
      block_offset = ((recv_from < split_rank)?
                      (recv_from * early_blockcount) :
                      (recv_from * late_blockcount + split_rank));
      block_count = ((recv_from < split_rank)? 
                     early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
   }

   /* Distribution loop - variation of ring allgather */
   send_to = (rank + 1) % size;
   recv_from = (rank + size - 1) % size;
   for (k = 0; k < size - 1; k++) {
      const int recv_data_from = (rank + size - k) % size;
      const int send_data_from = (rank + 1 + size - k) % size;
      const int send_block_offset = 
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
      const int recv_block_offset = 
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
          (recv_data_from * late_blockcount + split_rank));
      block_count = ((send_data_from < split_rank)? 
                     early_blockcount : late_blockcount);

      tmprecv = (char*)rbuf + recv_block_offset * extent;
      tmpsend = (char*)rbuf + send_block_offset * extent;

      smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to,
                                     666,
                                     tmprecv, early_blockcount, dtype, recv_from,
                                     666,
                                     comm, MPI_STATUS_IGNORE);

   }

   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

   return MPI_SUCCESS;

 error_hndl:
   XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n",
                __FILE__, line, rank, ret);
   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
   return ret;
}
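The COLL_TUNED_COMPUTE_BLOCKCOUNT helper used above is not shown in this listing. The standalone sketch below assumes the usual Open MPI-style semantics (the first split_rank "early" blocks carry one extra element when the count does not divide evenly) and replays the block-offset formula from the allreduce; the COMPUTE_BLOCKCOUNT macro here is an illustration under that assumption, not the verbatim definition.

#include <stdio.h>

/* Assumed semantics of COLL_TUNED_COMPUTE_BLOCKCOUNT (illustrative sketch,
   not the verbatim macro): split COUNT elements into NUM_BLOCKS blocks; the
   first SPLIT_INDEX "early" blocks get one extra element. */
#define COMPUTE_BLOCKCOUNT(COUNT, NUM_BLOCKS, SPLIT_INDEX,                  \
                           EARLY_BLOCK_COUNT, LATE_BLOCK_COUNT)             \
  do {                                                                      \
    (EARLY_BLOCK_COUNT) = (LATE_BLOCK_COUNT) = (COUNT) / (NUM_BLOCKS);      \
    (SPLIT_INDEX) = (COUNT) % (NUM_BLOCKS);                                 \
    if ((SPLIT_INDEX) != 0)                                                 \
      (EARLY_BLOCK_COUNT)++;                                                \
  } while (0)

int main(void)
{
  int count = 10, size = 4;   /* arbitrary example values */
  int split_rank, early_blockcount, late_blockcount;

  COMPUTE_BLOCKCOUNT(count, size, split_rank, early_blockcount, late_blockcount);

  /* For count=10, size=4: split_rank=2, early=3, late=2 -> blocks of 3,3,2,2. */
  printf("split_rank=%d early=%d late=%d\n",
         split_rank, early_blockcount, late_blockcount);

  /* Same block-offset formula as used in the allreduce above. */
  for (int r = 0; r < size; r++) {
    int block_offset = (r < split_rank) ? r * early_blockcount
                                        : r * late_blockcount + split_rank;
    int block_count  = (r < split_rank) ? early_blockcount : late_blockcount;
    printf("rank %d owns elements [%d, %d)\n",
           r, block_offset, block_offset + block_count);
  }
  return 0;
}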
int 
smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int rcount,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm
)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
                   size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   } 

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor;
        the send location becomes the previous receive location.
        (The index arithmetic is replayed in the standalone sketch after
        this function.)
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] = 
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;
      
      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype, 
                                     neighbor[i_parity], 
                                     COLL_TAG_ALLGATHER,
                                     tmprecv, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}
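The neighbor-exchange schedule above is easiest to see on a concrete case. The standalone sketch below (no MPI; size = 6 is an arbitrary even example) simply replays the index arithmetic of the function and prints which peer each rank talks to at every step and where its two-block send and receive windows start.

#include <stdio.h>

int main(void)
{
  const int size = 6;               /* must be even, as in the algorithm */

  for (int rank = 0; rank < size; rank++) {
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int even_rank = !(rank % 2);

    if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = +2;
      offset_at_step[1] = -2;
    } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = -2;
      offset_at_step[1] = +2;
    }

    printf("rank %d: step 0 exchanges its own block with rank %d\n",
           rank, neighbor[0]);
    send_data_from = even_rank ? rank : recv_data_from[0];

    for (int i = 1; i < size / 2; i++) {
      int i_parity = i % 2;
      recv_data_from[i_parity] =
          (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;
      printf("rank %d: step %d with rank %d, sends 2 blocks from %d,"
             " recvs 2 blocks from %d\n",
             rank, i, neighbor[i_parity],
             send_data_from, recv_data_from[i_parity]);
      send_data_from = recv_data_from[i_parity];
    }
  }
  return 0;
}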
Example #21
0
/* Arrival-pattern-aware bcast: the root scatters the data in arrival order,
   then each arrival set finishes with a ring allgather */
int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;//in order to use ANY_TAG, make this one positive
  int header_tag = 10;
  MPI_Status status;

  int curr_remainder;
  int curr_size;
  int curr_increment;
  int send_offset;
  int recv_offset;
  int send_count;
  int recv_count;

  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, k;

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);


  /* source and destination */
  int to, from;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* message too small */
  if (count < size) {
    XBT_WARN("MPI_bcast_arrival_scatter uses the default MPI_bcast.");
    smpi_mpi_bcast(buf, count, datatype, root, comm);
    return MPI_SUCCESS;        
  }



  /* if the root is not rank 0, forward the data to rank 0 first;
     this could be made faster by using logical source/destination ranks
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag - 1, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag - 1, comm, &status);
    }
  }


  /* value == 0 means the root has not sent data (or the header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* start bcast */

  /* root */
  if (rank == 0) {

    for (i = 0; i < max_node; i++)
      will_send[i] = 0;

    sent_count = 0;
    while (sent_count < (size - 1)) {

      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                       &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
                       &status);
              i = 0;
            }
          }
        }
      }
      header_index = 0;

      /* record the ranks whose 1-byte message arrived in this round */
      for (i = 1; i < size; i++) {
        /* message arrive */
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /*
         if (header_index != 0) {
         printf("header index = %d node = ",header_index);
         for (i=0;i<header_index;i++) {
         printf("%d ",header_buf[i]);
         }
         printf("\n");
         }
       */

      /* send header followed by data */
      if (header_index != 0) {
        header_buf[header_index] = -1;

        /* send header */
        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          smpi_mpi_send(header_buf, header_size, MPI_INT, to, header_tag, comm);
        }

        curr_remainder = count % header_index;
        curr_size = (count / header_index);
        curr_increment = curr_size * extent;

        /* send data */

        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          if ((i == (header_index - 1)) || (curr_size == 0))
            curr_size += curr_remainder;
          //printf("Root send to %d index %d\n",to,(i*curr_increment));
          smpi_mpi_send((char *) buf + (i * curr_increment), curr_size, datatype, to,
                   tag, comm);
        }
      }
    }                           /* while (sent_count < size-1) */
  }

  /* end of root (rank 0) section */
  /* non-root ranks */
  else {
    /* send 1-byte message to root */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for the header from the root */
    smpi_mpi_recv(header_buf, header_size, MPI_INT, 0, header_tag, comm, &status);

    /* search for where it is */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    int total_nodes = 0;
    while (header_buf[total_nodes] != -1) {
      total_nodes++;
    }

    curr_remainder = count % total_nodes;
    curr_size = (count / total_nodes);
    curr_increment = curr_size * extent;
    int recv_size = curr_size;

    /* receive data */
    if (myordering == (total_nodes - 1))
      recv_size += curr_remainder;
    smpi_mpi_recv((char *) buf + (myordering * curr_increment), recv_size, datatype,
             0, tag, comm, &status);

    /* at this point all nodes in this set perform an all-gather operation
       over a logical ring; handle the wrap-around explicitly so header_buf
       is never indexed out of bounds */
    if (myordering == (total_nodes - 1))
      to = header_buf[0];
    else
      to = header_buf[myordering + 1];
    if (myordering == 0)
      from = header_buf[total_nodes - 1];
    else
      from = header_buf[myordering - 1];


    /* last segment may have a larger size since it also includes the remainder */
    int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent;


    /* allgather */
    for (i = 0; i < total_nodes - 1; i++) {
      send_offset =
          ((myordering - i + total_nodes) % total_nodes) * curr_increment;
      recv_offset =
          ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment;

      /* adjust size */
      if (send_offset != last_segment_ptr)
        send_count = curr_size;
      else
        send_count = curr_size + curr_remainder;

      if (recv_offset != last_segment_ptr)
        recv_count = curr_size;
      else
        recv_count = curr_size + curr_remainder;

      //printf("\t\tnode %d sent_to %d recv_from %d send_size %d recv_size %d\n",rank,to,from,send_count,recv_count);
      //printf("\tnode %d sent_offset %d send_count %d\n",rank,send_offset,send_count);


      smpi_mpi_sendrecv((char *) buf + send_offset, send_count, datatype, to,
                   tag + i, (char *) buf + recv_offset, recv_count, datatype,
                   from, tag + i, comm, &status);
    }
  }                             /* non-root */

  return MPI_SUCCESS;
}
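Everything on the non-root side above is driven by the header the root sends: the list of ranks that arrived in that round, terminated by -1. The standalone sketch below (the header contents, rank and count are hypothetical example values; no MPI) mirrors the arithmetic of the listing to derive a rank's position in the arrival set, its segment size, and its ring neighbours, with the wrap-around handled before indexing.

#include <stdio.h>

int main(void)
{
  /* Example header as the root would send it: ranks 3, 5, 2 arrived in this
     round, terminated by -1 (hypothetical values for illustration). */
  int header_buf[] = { 3, 5, 2, -1 };
  int rank = 5;                    /* the rank running this code */
  int count = 10;                  /* total broadcast element count */

  /* Position of this rank inside the arrival set. */
  int myordering = 0;
  while (header_buf[myordering] != rank)
    myordering++;

  /* Number of ranks served in this round. */
  int total_nodes = 0;
  while (header_buf[total_nodes] != -1)
    total_nodes++;

  /* Segment sizes: every node gets count/total_nodes elements, and the last
     one also absorbs the remainder. */
  int curr_size = count / total_nodes;
  int curr_remainder = count % total_nodes;
  int recv_size = curr_size
      + ((myordering == total_nodes - 1) ? curr_remainder : 0);

  /* Ring neighbours inside the arrival set (wrap-around handled explicitly,
     so header_buf[-1] is never read). */
  int to   = (myordering == total_nodes - 1) ? header_buf[0]
                                             : header_buf[myordering + 1];
  int from = (myordering == 0) ? header_buf[total_nodes - 1]
                               : header_buf[myordering - 1];

  printf("rank %d: ordering %d of %d, recv %d elems, ring send->%d recv<-%d\n",
         rank, myordering, total_nodes, recv_size, to, from);
  return 0;
}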
Example #22
0
int smpi_coll_tuned_allgatherv_mpich_rdb (
  void *sendbuf,
  int sendcount,
  MPI_Datatype sendtype,
  void *recvbuf,
  int *recvcounts,
  int *displs,
  MPI_Datatype recvtype,
  MPI_Comm comm)
{
  int        comm_size, rank, j, i;
  MPI_Status status;
  MPI_Aint  recvtype_extent, recvtype_true_extent, recvtype_true_lb;
  int curr_cnt, dst, total_count;
  void *tmp_buf, *tmp_buf_rl;
  int mask, dst_tree_root, my_tree_root, position,
    send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
    offset, tmp_mask, tree_root;

  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  total_count = 0;
  for (i=0; i<comm_size; i++)
    total_count += recvcounts[i];

  if (total_count == 0) return MPI_ERR_COUNT;

  recvtype_extent=smpi_datatype_get_extent( recvtype);

  /* need to receive contiguously into tmp_buf because
     displs could make the recvbuf noncontiguous */

  smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);

  tmp_buf_rl= (void*)smpi_get_tmp_sendbuffer(total_count*(MAX(recvtype_true_extent,recvtype_extent)));

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf_rl - recvtype_true_lb);

  /* copy local data into right location in tmp_buf */
  position = 0;
  for (i=0; i<rank; i++) position += recvcounts[i];
  if (sendbuf != MPI_IN_PLACE)
  {
    smpi_datatype_copy(sendbuf, sendcount, sendtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  else
  {
    /* if in_place specified, local data is found in recvbuf */
    smpi_datatype_copy(((char *)recvbuf +
                        displs[rank]*recvtype_extent),
                       recvcounts[rank], recvtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  curr_cnt = recvcounts[rank];

  mask = 0x1;
  i = 0;
  while (mask < comm_size) {
    dst = rank ^ mask;

    /* find offset into send and recv buffers. zero out
       the least significant "i" bits of rank and dst to
       find root of src and dst subtrees. Use ranks of
       roots as index to send from and recv into buffer */

    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;

    if (dst < comm_size) {
      send_offset = 0;
      for (j=0; j<my_tree_root; j++)
        send_offset += recvcounts[j];

      recv_offset = 0;
      for (j=0; j<dst_tree_root; j++)
        recv_offset += recvcounts[j];

      smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
                        curr_cnt, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        ((char *)tmp_buf + recv_offset * recvtype_extent),
                        total_count - recv_offset, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        comm, &status);
      /* for convenience, recv is posted for a bigger amount
         than will be sent */
      last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
      curr_cnt += last_recv_cnt;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic
       recursive-halving algorithm for this. */

    /* This part of the code will not currently be
       executed because we are not using recursive
       doubling for non power of two. Mark it as experimental
       so that it doesn't show up as red in the coverage
       tests. */

    /* --BEGIN EXPERIMENTAL-- */
    if (dst_tree_root + mask > comm_size) {
      nprocs_completed = comm_size - my_tree_root - mask;
      /* nprocs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      tmp_mask = mask >> 1;

      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank) &&
            (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];
          offset *= recvtype_extent;

          smpi_mpi_send(((char *)tmp_buf + offset),
                        last_recv_cnt,
                        recvtype, dst,
                        COLL_TAG_ALLGATHERV, comm);
          /* last_recv_cnt was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank) &&
                 (dst < tree_root + nprocs_completed) &&
                 (rank >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];

          smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
                        total_count - offset, recvtype,
                        dst, COLL_TAG_ALLGATHERV,
                        comm, &status);
          /* for convenience, recv is posted for a
             bigger amount than will be sent */
          last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
          curr_cnt += last_recv_cnt;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    /* --END EXPERIMENTAL-- */

    mask <<= 1;
    i++;
  }
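  /* NOTE: the listing is truncated here. A reconstructed sketch of the usual
     MPICH-style ending (an assumption, not the verbatim original): copy the
     data gathered in tmp_buf back into recvbuf at the user-supplied
     displacements, then release the temporary buffer. */
  position = 0;
  for (j = 0; j < comm_size; j++) {
    if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
      /* no copy needed for our own block when MPI_IN_PLACE was given */
      smpi_datatype_copy(((char *)tmp_buf + position * recvtype_extent),
                         recvcounts[j], recvtype,
                         ((char *)recvbuf + displs[j] * recvtype_extent),
                         recvcounts[j], recvtype);
    }
    position += recvcounts[j];
  }
  smpi_free_tmp_buffer(tmp_buf_rl);
  return MPI_SUCCESS;
}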
Example #23
0
int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
                                      MPI_Datatype stype, void *rbuf,
                                      int rcount, MPI_Datatype rtype,
                                      MPI_Comm comm)
{
  int src, dst, comm_size, rank;
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);
  MPI_Aint rextent, sextent;
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);
  int tag = COLL_TAG_ALLGATHER;
  MPI_Request request;
  MPI_Request rrequest_array[128];
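  /* NOTE: fixed-size request array; the inter-SMP ring below posts
     inter_comm_size - 1 irecvs, so this assumes inter_comm_size <= 129 */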

  MPI_Status status;
  int i, send_offset, recv_offset;
  int intra_rank, inter_rank;
  intra_rank = rank % NUM_CORE;
  inter_rank = rank / NUM_CORE;
  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
  int num_core_in_current_smp = NUM_CORE;

  /* for too small number of processes, use default implementation */
  if (comm_size <= NUM_CORE) {
    XBT_WARN("MPI_allgather_SMP_NTS uses the default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;    
  }

  // the last SMP node may have fewer running processes than the others
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE);
  }
  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
               ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
               &status);

  //gather to root of each SMP

  for (i = 1; i < num_core_in_current_smp; i++) {

    dst =
        (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp);
    src =
        (inter_rank * NUM_CORE) + (intra_rank - i +
                                   num_core_in_current_smp) %
        (num_core_in_current_smp);
    recv_offset = src * rextent * rcount;

    smpi_mpi_sendrecv(sbuf, scount, stype, dst, tag,
                 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
                 &status);

  }

  // INTER-SMP ALLGATHER
  // The root of each SMP node posts the inter-node sends/receives, then does an
  // intra-node linear bcast for each received message, following a logical ring
  // algorithm. (The rank decomposition is replayed in the sketch after this function.)

  // root of each SMP
  if (intra_rank == 0) {
    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
    dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;

    // post all inter Irecv
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      rrequest_array[i] = smpi_mpi_irecv((char *)rbuf+recv_offset, rcount * NUM_CORE, rtype, src, tag+i, comm);
    }

    // send first message
    send_offset =
        ((inter_rank +
          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
    smpi_mpi_isend((char *) rbuf + send_offset, scount * NUM_CORE, stype,
                   dst, tag, comm);

    // loop : recv-inter , send-inter, send-intra (linear-bcast)
    for (i = 0; i < inter_comm_size - 2; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      smpi_mpi_wait(&rrequest_array[i], &status);
      smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                     dst, tag + i + 1, comm);
      if (num_core_in_current_smp > 1) {
        request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                  (rank + 1), tag + i + 1, comm);
      }
    }

    // recv last message and send_intra
    recv_offset =
        ((inter_rank - i - 1 +
          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
    //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount;
    //i=inter_comm_size-2;
    smpi_mpi_wait(&rrequest_array[i], &status);
    if (num_core_in_current_smp > 1) {
      request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                (rank + 1), tag + i + 1, comm);
    }
  }
  // last rank of each SMP
  else if (intra_rank == (num_core_in_current_smp - 1)) {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
                rank - 1, tag + i + 1, comm);
      smpi_mpi_wait(&request, &status);
    }
  }
  // intermediate rank of each SMP
  else {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
                rank - 1, tag + i + 1, comm);
      smpi_mpi_wait(&request, &status);
      request = smpi_mpi_isend((char *) rbuf + recv_offset, (scount * NUM_CORE), stype,
                (rank + 1), tag + i + 1, comm);
    }
  }

  return MPI_SUCCESS;
}
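The SMP-aware examples here all use the same decomposition: intra_rank = rank % NUM_CORE and inter_rank = rank / NUM_CORE, with the node leaders (intra_rank == 0) forming a logical ring. The standalone sketch below (NUM_CORE = 4 and comm_size = 10 are placeholder values) prints that decomposition and each leader's ring neighbours, matching the arithmetic used above.

#include <stdio.h>

#define NUM_CORE 4   /* placeholder; the real value is platform-dependent */

int main(void)
{
  int comm_size = 10;
  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;

  for (int rank = 0; rank < comm_size; rank++) {
    int intra_rank = rank % NUM_CORE;
    int inter_rank = rank / NUM_CORE;

    /* The last SMP node may host fewer processes than NUM_CORE. */
    int cores_here = (inter_rank == inter_comm_size - 1)
                         ? comm_size - inter_rank * NUM_CORE
                         : NUM_CORE;

    if (intra_rank == 0) {
      /* SMP roots form a logical ring over the node leaders. */
      int src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
      int dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;
      printf("rank %2d: node %d (root of %d cores), ring %d -> %d -> %d\n",
             rank, inter_rank, cores_here, src, rank, dst);
    } else {
      printf("rank %2d: node %d, local rank %d of %d\n",
             rank, inter_rank, intra_rank, cores_here);
    }
  }
  return 0;
}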
Example #24
0
/*
This function performs the all-reduce operation as follows:
1) binomial-tree reduce inside each SMP node
2) inter-node reduce-scatter between the roots of the SMP nodes
3) inter-node allgather between the roots of the SMP nodes
4) binomial-tree bcast inside each SMP node
(The inter-node ring schedule of step 2 is replayed in the sketch after this function.)
*/
int smpi_coll_tuned_allreduce_smp_rsag(void *send_buf, void *recv_buf,
                                       int count, MPI_Datatype dtype, MPI_Op op,
                                       MPI_Comm comm)
{
  int comm_size, rank;
  void *tmp_buf;
  int tag = COLL_TAG_ALLREDUCE;
  int mask, src, dst;
  MPI_Status status;
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }
  int num_core=1;
  if (smpi_comm_is_uniform(comm)){
    num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }
  /*
     #ifdef MPICH2_REDUCTION
     MPI_User_function * uop = MPIR_Op_table[op % 16 - 1];
     #else
     MPI_User_function *uop;
     struct MPIR_OP *op_ptr;
     op_ptr = MPIR_ToPointer(op);
     uop  = op_ptr->op;
     #endif
   */
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);
  MPI_Aint extent;
  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  int intra_rank, inter_rank;
  intra_rank = rank % num_core;
  inter_rank = rank / num_core;

  //printf("node %d intra_rank = %d, inter_rank = %d\n", rank, intra_rank, inter_rank);

  int inter_comm_size = (comm_size + num_core - 1) / num_core;

  if (!rank) {
    //printf("intra com size = %d\n",num_core);
    //printf("inter com size = %d\n",inter_comm_size);
  }


  smpi_mpi_sendrecv(send_buf, count, dtype, rank, tag,
               recv_buf, count, dtype, rank, tag, comm, &status);


  // SMP_binomial_reduce
  mask = 1;
  while (mask < num_core) {
    if ((mask & intra_rank) == 0) {
      src = (inter_rank * num_core) + (intra_rank | mask);
      //      if (src < ((inter_rank + 1) * num_core)) {
      if (src < comm_size) {
        smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
        smpi_op_apply(op, tmp_buf, recv_buf, &count, &dtype);
        //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
      }
    } else {

      dst = (inter_rank * num_core) + (intra_rank & (~mask));
      smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
      //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
      break;
    }
    mask <<= 1;
  }



  // INTER: reduce-scatter
  if (intra_rank == 0) {
    int send_offset, recv_offset;
    int seg_count = count / inter_comm_size;
    int to = ((inter_rank + 1) % inter_comm_size) * num_core;
    int from =
        ((inter_rank + inter_comm_size - 1) % inter_comm_size) * num_core;
    int i;

    //printf("node %d to %d from %d\n",rank,to,from);

    for (i = 0; i < (inter_comm_size - 1); i++) {

      send_offset =
          ((inter_rank - 1 - i +
            inter_comm_size) % inter_comm_size) * seg_count * extent;
      recv_offset =
          ((inter_rank - 2 - i +
            inter_comm_size) % inter_comm_size) * seg_count * extent;

      smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to,
                   tag + i, tmp_buf, seg_count, dtype, from, tag + i, comm,
                   &status);

      // partial result accumulates in recv_buf
      smpi_op_apply(op, tmp_buf, (char *) recv_buf + recv_offset, &seg_count,
                     &dtype);
    }

    // INTER: allgather
    for (i = 0; i < (inter_comm_size - 1); i++) {

      send_offset =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * seg_count * extent;
      recv_offset =
          ((inter_rank - 1 - i +
            inter_comm_size) % inter_comm_size) * seg_count * extent;

      smpi_mpi_sendrecv((char *) recv_buf + send_offset, seg_count, dtype, to,
                   tag + i, (char *) recv_buf + recv_offset, seg_count, dtype,
                   from, tag + i, comm, &status);

    }
  }



     // INTER_binomial_reduce

     // only root node for each SMP
//     if (intra_rank == 0) {
//
//     mask = 1;
//     while (mask < inter_comm_size) {
//     if ((mask & inter_rank) == 0) {
//     src = (inter_rank | mask) * num_core;
//     if (src < comm_size) {
//     smpi_mpi_recv(tmp_buf, count, dtype, src, tag, comm, &status);
//     (* uop) (tmp_buf, recv_buf, &count, &dtype);
     //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
//     }
//     }
//     else {
//     dst = (inter_rank & (~mask)) * num_core;
//     smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
     //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
//     break;
//     }
//     mask <<=1;
//     }
//     }

     // INTER_binomial_bcast


//     if (intra_rank == 0) {
//     mask = 1;
//     while (mask < inter_comm_size) {
//     if (inter_rank & mask) {
//     src = (inter_rank - mask) * num_core;
     //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
//     smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
//     break;
//     }
//     mask <<= 1;
//     }
//
//     mask >>= 1;
     //printf("My rank = %d my mask = %d\n", rank,mask);

//     while (mask > 0) {
//     if (inter_rank < inter_comm_size) {
//     dst = (inter_rank + mask) * num_core;
//     if (dst < comm_size) {
     //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
//     smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
//     }
//     }
//     mask >>= 1;
//     }
//     }


  // INTRA_binomial_bcast

  int num_core_in_current_smp = num_core;
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * num_core);
  }
  //  printf("Node %d num_core = %d\n",rank, num_core_in_current_smp);
  mask = 1;
  while (mask < num_core_in_current_smp) {
    if (intra_rank & mask) {
      src = (inter_rank * num_core) + (intra_rank - mask);
      //printf("Node %d recv from node %d when mask is %d\n", rank, src, mask);
      smpi_mpi_recv(recv_buf, count, dtype, src, tag, comm, &status);
      break;
    }
    mask <<= 1;
  }

  mask >>= 1;
  //printf("My rank = %d my mask = %d\n", rank,mask);

  while (mask > 0) {
    dst = (inter_rank * num_core) + (intra_rank + mask);
    if (dst < comm_size) {
      //printf("Node %d send to node %d when mask is %d\n", rank, dst, mask);
      smpi_mpi_send(recv_buf, count, dtype, dst, tag, comm);
    }
    mask >>= 1;
  }

  smpi_free_tmp_buffer(tmp_buf);
  return MPI_SUCCESS;
}
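In the inter-node reduce-scatter above, at step i leader inter_rank sends its partial of segment (inter_rank - 1 - i) mod inter_comm_size and reduces the incoming data into segment (inter_rank - 2 - i) mod inter_comm_size. The standalone replay below (inter_comm_size = 4 is an arbitrary example) just prints that schedule so the segment ownership at the end of the phase is visible.

#include <stdio.h>

int main(void)
{
  int inter_comm_size = 4;   /* number of SMP leaders on the ring */

  for (int inter_rank = 0; inter_rank < inter_comm_size; inter_rank++) {
    printf("leader %d:\n", inter_rank);
    for (int i = 0; i < inter_comm_size - 1; i++) {
      /* Same index arithmetic as the reduce-scatter loop above. */
      int send_seg = (inter_rank - 1 - i + inter_comm_size) % inter_comm_size;
      int recv_seg = (inter_rank - 2 - i + inter_comm_size) % inter_comm_size;
      printf("  step %d: send segment %d, reduce into segment %d\n",
             i, send_seg, recv_seg);
    }
  }
  /* After inter_comm_size - 1 steps each leader holds the fully reduced
     segment with index inter_rank, which the allgather phase then
     circulates around the same ring. */
  return 0;
}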
Example #25
0
/*****************************************************************************
 * Function: allgather_bruck
 * return: int
 * inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to received
 *   recv_type: data type of elements being received
 *   comm: communication
 * Descrp: Function realizes the allgather operation using the bruck
 *         algorithm.
 * Author: MPICH
 * Comment: Original bruck algorithm from MPICH is slightly modified by
 *          Ahmad Faraj.  
 ****************************************************************************/
int smpi_coll_tuned_allgather_bruck(void *send_buff, int send_count,
                                    MPI_Datatype send_type, void *recv_buff,
                                    int recv_count, MPI_Datatype recv_type,
                                    MPI_Comm comm)
{
  // MPI variables
  MPI_Status status;
  MPI_Aint recv_extent;

  // local int variables
  int src, dst, rank, num_procs, count, remainder;
  int tag = COLL_TAG_ALLGATHER;
  int pof2 = 1;

  // local string variables
  char *tmp_buff;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  // get size of the communicator, followed by rank 
  num_procs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  // get size of single element's type for recv buffer
  recv_extent = smpi_datatype_get_extent(recv_type);

  count = recv_count;

  tmp_buff = (char *) xbt_malloc(num_procs * recv_count * recv_extent);

  // perform a local copy
  smpi_datatype_copy(send_ptr, send_count, send_type,
		     tmp_buff, recv_count, recv_type);
  while (pof2 <= (num_procs / 2)) {
    src = (rank + pof2) % num_procs;
    dst = (rank - pof2 + num_procs) % num_procs;

    smpi_mpi_sendrecv(tmp_buff, count, recv_type, dst, tag,
                  tmp_buff + count * recv_extent, count, recv_type,
                  src, tag, comm, &status);
    count *= 2;
    pof2 *= 2;
  }

  remainder = num_procs - pof2;
  if (remainder) {
    src = (rank + pof2) % num_procs;
    dst = (rank - pof2 + num_procs) % num_procs;

    smpi_mpi_sendrecv(tmp_buff, remainder * recv_count, recv_type, dst, tag,
                  tmp_buff + count * recv_extent, remainder * recv_count,
                  recv_type, src, tag, comm, &status);
  }

  smpi_mpi_sendrecv(tmp_buff, (num_procs - rank) * recv_count, recv_type, rank,
                tag, recv_ptr + rank * recv_count * recv_extent,
                (num_procs - rank) * recv_count, recv_type, rank, tag, comm,
                &status);

  if (rank)
    smpi_mpi_sendrecv(tmp_buff + (num_procs - rank) * recv_count * recv_extent,
                  rank * recv_count, recv_type, rank, tag, recv_ptr,
                  rank * recv_count, recv_type, rank, tag, comm, &status);
  free(tmp_buff);
  return MPI_SUCCESS;
}
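After the doubling loop plus the remainder step above, tmp_buff holds the blocks in rotated order: position j contains the block that originated at rank (rank + j) mod num_procs, and the two final local sendrecvs undo that rotation. The standalone sketch below (NUM_PROCS = 5 is an arbitrary example) tracks one integer per block, with no MPI, to check that the receive buffer ends up in rank order.

#include <stdio.h>

#define NUM_PROCS 5

int main(void)
{
  for (int rank = 0; rank < NUM_PROCS; rank++) {
    int tmp[NUM_PROCS], recv[NUM_PROCS];

    /* State after the doubling + remainder steps: slot j of tmp_buff holds
       the block that originated at rank (rank + j) % NUM_PROCS. */
    for (int j = 0; j < NUM_PROCS; j++)
      tmp[j] = (rank + j) % NUM_PROCS;

    /* First local copy: tmp[0 .. NUM_PROCS-rank-1] -> recv[rank ..]. */
    for (int j = 0; j < NUM_PROCS - rank; j++)
      recv[rank + j] = tmp[j];

    /* Second local copy (skipped when rank == 0):
       tmp[NUM_PROCS-rank ..] -> recv[0 .. rank-1]. */
    for (int j = 0; j < rank; j++)
      recv[j] = tmp[NUM_PROCS - rank + j];

    printf("rank %d: ", rank);
    for (int j = 0; j < NUM_PROCS; j++)
      printf("%d ", recv[j]);        /* expected: 0 1 2 3 4 */
    printf("\n");
  }
  return 0;
}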
Example #26
0
int smpi_coll_tuned_allreduce_rab_rsag(void *sbuff, void *rbuff, int count,
                                       MPI_Datatype dtype, MPI_Op op,
                                       MPI_Comm comm)
{
  int nprocs, rank, tag = COLL_TAG_ALLREDUCE;
  int mask, dst, pof2, newrank, rem, newdst, i,
      send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
  MPI_Aint extent;
  MPI_Status status;
  void *tmp_buf = NULL;
  nprocs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) xbt_malloc(count * extent);

  smpi_mpi_sendrecv(sbuff, count, dtype, rank, tag, rbuff, count, dtype, rank, tag,
               comm, &status);

  // find nearest power-of-two less than or equal to comm_size
  pof2 = 1;
  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;

  rem = nprocs - pof2;

  // In the non-power-of-two case, all even-numbered
  // processes of rank < 2*rem send their data to
  // (rank+1). These even-numbered processes no longer
  // participate in the algorithm until the very end. The
  // remaining processes form a nice power-of-two. 

  if (rank < 2 * rem) {
    // even       
    if (rank % 2 == 0) {

      smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm);

      // temporarily set the rank to -1 so that this
      // process does not participate in recursive
      // doubling
      newrank = -1;
    } else                      // odd
    {
      smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
      // do the reduction on received data. since the
      // ordering is right, it doesn't matter whether
      // the operation is commutative or not.
      smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype);

      // change the rank 
      newrank = rank / 2;
    }
  }

  else                          // rank >= 2 * rem 
    newrank = rank - rem;

  // If op is user-defined or count is less than pof2, use
  // recursive doubling algorithm. Otherwise do a reduce-scatter
  // followed by allgather. (If op is user-defined,
  // derived datatypes are allowed and the user could pass basic
  // datatypes on one process and derived on another as long as
  // the type maps are the same. Breaking up derived
  // datatypes to do the reduce-scatter is tricky, therefore
  // using recursive doubling in that case.) 

  if (newrank != -1) {
    // do a reduce-scatter followed by allgather. for the
    // reduce-scatter, calculate the count that each process receives
    // and the displacement within the buffer 

    cnts = (int *) xbt_malloc(pof2 * sizeof(int));
    disps = (int *) xbt_malloc(pof2 * sizeof(int));

    for (i = 0; i < (pof2 - 1); i++)
      cnts[i] = count / pof2;
    cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

    disps[0] = 0;
    for (i = 1; i < pof2; i++)
      disps[i] = disps[i - 1] + cnts[i - 1];

    mask = 0x1;
    send_idx = recv_idx = 0;
    last_idx = pof2;
    while (mask < pof2) {
      newdst = newrank ^ mask;
      // find real rank of dest 
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        send_idx = recv_idx + pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      }

      // Send data from recvbuf. Recv into tmp_buf 
      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                   dtype, dst, tag,
                   (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
                   dtype, dst, tag, comm, &status);

      // tmp_buf contains data received in this step.
      // recvbuf contains data accumulated so far 

      // This algorithm is used only for predefined ops
      // and predefined ops are always commutative.
      smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                     (char *) rbuff + disps[recv_idx] * extent,
                     &recv_cnt, &dtype);

      // update send_idx for next iteration 
      send_idx = recv_idx;
      mask <<= 1;

      // update last_idx, but not in last iteration because the value
      // is needed in the allgather step below. 
      if (mask < pof2)
        last_idx = recv_idx + pof2 / mask;
    }

    // now do the allgather 

    mask >>= 1;
    while (mask > 0) {
      newdst = newrank ^ mask;
      // find real rank of dest
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        // update last_idx except on first iteration 
        if (mask != pof2 / 2)
          last_idx = last_idx + pof2 / (mask * 2);

        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx - pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      }

      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                   dtype, dst, tag,
                   (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
                   dtype, dst, tag, comm, &status);

      if (newrank > newdst)
        send_idx = recv_idx;

      mask >>= 1;
    }

    free(cnts);
    free(disps);

  }
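  /* NOTE: the listing is truncated here. A reconstructed sketch of the usual
     ending for this reduce-scatter/allgather scheme (an assumption, not the
     verbatim original): odd ranks below 2*rem return the result to the even
     ranks that sat out, then the temporary buffer is released. */
  if (rank < 2 * rem) {
    if (rank % 2)               // odd
      smpi_mpi_send(rbuff, count, dtype, rank - 1, tag, comm);
    else                        // even
      smpi_mpi_recv(rbuff, count, dtype, rank + 1, tag, comm, &status);
  }

  free(tmp_buf);
  return MPI_SUCCESS;
}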
Example #27
0
int
smpi_coll_tuned_bcast_scatter_rdb_allgather(void *buff, int count, MPI_Datatype
                                            data_type, int root, MPI_Comm comm)
{
  MPI_Aint extent;
  MPI_Status status;

  int i, j, k, src, dst, rank, num_procs, send_offset, recv_offset;
  int mask, relative_rank, curr_size, recv_size = 0, send_size, nbytes;
  int scatter_size, tree_root, relative_dst, dst_tree_root;
  int my_tree_root, offset, tmp_mask, num_procs_completed;
  int tag = COLL_TAG_BCAST;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(data_type);

  nbytes = extent * count;
  scatter_size = (nbytes + num_procs - 1) / num_procs;  // ceiling division 
  curr_size = (rank == root) ? nbytes : 0;      // root starts with all the data
  relative_rank = (rank >= root) ? rank - root : rank - root + num_procs;

  mask = 0x1;
  while (mask < num_procs) {
    if (relative_rank & mask) {
      src = rank - mask;
      if (src < 0)
        src += num_procs;
      recv_size = nbytes - relative_rank * scatter_size;
      //  recv_size is larger than what might actually be sent by the
      //  sender. We don't need to compute the exact value because MPI
      //  allows you to post a larger recv.
      if (recv_size <= 0)
        curr_size = 0;          // this process doesn't receive any data
      // because of uneven division 
      else {
        smpi_mpi_recv((char *)buff + relative_rank * scatter_size, recv_size,
                 MPI_BYTE, src, tag, comm, &status);
        curr_size = smpi_mpi_get_count(&status, MPI_BYTE);
      }
      break;
    }
    mask <<= 1;
  }

  // This process is responsible for all processes that have bits
  // set from the LSB up to (but not including) mask.  Because of
  // the "not including", we start by shifting mask back down
  // one.

  mask >>= 1;
  while (mask > 0) {
    if (relative_rank + mask < num_procs) {
      send_size = curr_size - scatter_size * mask;
      // mask is also the size of this process's subtree 

      if (send_size > 0) {
        dst = rank + mask;
        if (dst >= num_procs)
          dst -= num_procs;
        smpi_mpi_send((char *)buff + scatter_size * (relative_rank + mask),
                 send_size, MPI_BYTE, dst, tag, comm);

        curr_size -= send_size;
      }
    }
    mask >>= 1;
  }

  // scatter done; now do the allgather


  mask = 0x1;
  i = 0;
  while (mask < num_procs) {
    relative_dst = relative_rank ^ mask;

    dst = (relative_dst + root) % num_procs;

    /* find offset into send and recv buffers.
       zero out the least significant "i" bits of relative_rank and
       relative_dst to find root of src and dst
       subtrees. Use ranks of roots as index to send from
       and recv into  buffer */

    dst_tree_root = relative_dst >> i;
    dst_tree_root <<= i;

    my_tree_root = relative_rank >> i;
    my_tree_root <<= i;

    send_offset = my_tree_root * scatter_size;
    recv_offset = dst_tree_root * scatter_size;

    if (relative_dst < num_procs) {
      smpi_mpi_sendrecv((char *)buff + send_offset, curr_size, MPI_BYTE, dst, tag,
                   (char *)buff + recv_offset, scatter_size * mask, MPI_BYTE, dst,
                   tag, comm, &status);
      recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
      curr_size += recv_size;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic recursive-halving algorithm
       for this. */

    if (dst_tree_root + mask > num_procs) {
      num_procs_completed = num_procs - my_tree_root - mask;
      /* num_procs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      offset = scatter_size * (my_tree_root + mask);
      tmp_mask = mask >> 1;

      while (tmp_mask) {
        relative_dst = relative_rank ^ tmp_mask;
        dst = (relative_dst + root) % num_procs;

        tree_root = relative_rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. */

        if ((relative_dst > relative_rank)
            && (relative_rank < tree_root + num_procs_completed)
            && (relative_dst >= tree_root + num_procs_completed)) {
          smpi_mpi_send((char *)buff + offset, recv_size, MPI_BYTE, dst, tag, comm);

          /* recv_size was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((relative_dst < relative_rank)
                 && (relative_dst < tree_root + num_procs_completed)
                 && (relative_rank >= tree_root + num_procs_completed)) {

          smpi_mpi_recv((char *)buff + offset, scatter_size * num_procs_completed,
                   MPI_BYTE, dst, tag, comm, &status);

          /* num_procs_completed is also equal to the no. of processes
             whose data we don't have */
          recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
          curr_size += recv_size;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    mask <<= 1;
    i++;
  }
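  /* NOTE: the listing is truncated here; the recursive-doubling allgather
     above is the final phase, so the function presumably just ends with: */
  return MPI_SUCCESS;
}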
Example #28
0
 int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf,
                             void *recvbuf,
                             int count,
                             MPI_Datatype datatype,
                             MPI_Op op, MPI_Comm comm)
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
        send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf, *tmp_buf_free;

    if (count == 0) {
        return MPI_SUCCESS;
    }

    /* homogeneous */

    comm_size =  smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    is_commutative = smpi_op_is_commute(op);

    /* need to allocate temporary buffer to store incoming data */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *) ((char *) tmp_buf_free - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno =
            smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count,
                           datatype);
    }

    /* find nearest power-of-two less than or equal to comm_size */
    for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
    pof2 >>=1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2 * rem) {
        if (rank % 2 == 0) {
            /* even */
            smpi_mpi_send(recvbuf, count, datatype, rank + 1,
                                     COLL_TAG_ALLREDUCE, comm);

            /* temporarily set the rank to -1 so that this
               process does not participate in recursive
               doubling */
            newrank = -1;
        } else {
            /* odd */
            smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm,
                                     MPI_STATUS_IGNORE);
            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
               smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                /* change the rank */
                newrank = rank / 2;
        }
    } else {                /* rank >= 2*rem */
        newrank = rank - rem;
    }

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.) */

    if (newrank != -1) {
        if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) {  /* use recursive doubling */
            mask = 0x1;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                /* Send the most current data, which is in recvbuf. Recv
                   into tmp_buf */
                smpi_mpi_sendrecv(recvbuf, count, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             tmp_buf, count, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                if (is_commutative || (dst < rank)) {
                    /* op is commutative OR the order is already right */
                     smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    /* op is noncommutative and the order is not right */
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
                                               recvbuf, count, datatype);
                }
                mask <<= 1;
            }
        } else {

            /* do a reduce-scatter followed by allgather */

            /* for the reduce-scatter, calculate the count that
               each process receives and the displacement within
               the buffer */
            cnts = (int *)xbt_malloc(pof2 * sizeof (int));
            disps = (int *)xbt_malloc(pof2 * sizeof (int));

            for (i = 0; i < (pof2 - 1); i++) {
                cnts[i] = count / pof2;
            }
            cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

            disps[0] = 0;
            for (i = 1; i < pof2; i++) {
                disps[i] = disps[i - 1] + cnts[i - 1];
            }

            mask = 0x1;
            send_idx = recv_idx = 0;
            last_idx = pof2;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    send_idx = recv_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < send_idx; i++)
                        recv_cnt += cnts[i];
                } else {
                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < last_idx; i++)
                        recv_cnt += cnts[i];
                }

                /* Send data from recvbuf. Recv into tmp_buf */
                smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) tmp_buf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                /* This algorithm is used only for predefined ops
                   and predefined ops are always commutative. */

                smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                        (char *) recvbuf + disps[recv_idx] * extent,
                        &recv_cnt, &datatype);

                /* update send_idx for next iteration */
                send_idx = recv_idx;
                mask <<= 1;

                /* update last_idx, but not in last iteration
                   because the value is needed in the allgather
                   step below. */
                if (mask < pof2)
                    last_idx = recv_idx + pof2 / mask;
            }

            /* now do the allgather */

            mask >>= 1;
            while (mask > 0) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    /* update last_idx except on first iteration */
                    if (mask != pof2 / 2) {
                        last_idx = last_idx + pof2 / (mask * 2);
                    }

                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < last_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                } else {
                    recv_idx = send_idx - pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < send_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                }

               smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) recvbuf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);
                if (newrank > newdst) {
                    send_idx = recv_idx;
                }

                mask >>= 1;
            }
        }
    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the even-numbered processes that did not
       participate above. */
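    /* For example, with 6 processes pof2 = 4 and rem = 2, so ranks 1 and 3
       send the final result to ranks 0 and 2. */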
    if (rank < 2 * rem) {
        if (rank % 2) {     /* odd */
            smpi_mpi_send(recvbuf, count,
                                     datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm);
        } else {            /* even */
            smpi_mpi_recv(recvbuf, count,
                                  datatype, rank + 1,
                                  COLL_TAG_ALLREDUCE, comm,
                                  MPI_STATUS_IGNORE);
        }
    }
    smpi_free_tmp_buffer(tmp_buf_free);
    return (mpi_errno);

}
Example #29
int
smpi_coll_tuned_allgather_rhv(void *sbuf, int send_count,
                              MPI_Datatype send_type, void *rbuf,
                              int recv_count, MPI_Datatype recv_type,
                              MPI_Comm comm)
{
  MPI_Status status;
  MPI_Aint s_extent, r_extent;

  // local int variables
  int i, dst, send_base_offset, recv_base_offset, send_chunk, recv_chunk,
      send_offset, recv_offset;
  int rank, num_procs;
  int tag = 50;
  int mask;
  int curr_count;

  // get size of the communicator, followed by rank 
  num_procs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  // get size of single element's type for send buffer and recv buffer
  s_extent = smpi_datatype_get_extent(send_type);
  r_extent = smpi_datatype_get_extent(recv_type);

  // multiply size of each element by number of elements to send or recv
  send_chunk = s_extent * send_count;
  recv_chunk = r_extent * recv_count;

  if (send_chunk != recv_chunk) {
    XBT_WARN("MPI_allgather_rhv use default MPI_allgather.");  
    smpi_mpi_allgather(sbuf, send_count, send_type, rbuf, recv_count,
                              recv_type, comm);
    return MPI_SUCCESS;        
  }

  // compute the starting block offset used for the initial exchange
  int size = num_procs / 2;
  int base_offset = 0;
  mask = 1;
  while (mask < num_procs) {
    if (rank & mask) {
      base_offset += size;
    }
    mask <<= 1;
    size /= 2;
  }
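
  // base_offset is the bit-reversed rank (assuming num_procs is a power of
  // two): bit k of rank contributes num_procs / 2^(k+1) blocks to the offset.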

  //  printf("node %d base_offset %d\n",rank,base_offset);

  //perform a remote copy

  dst = base_offset;
  smpi_mpi_sendrecv(sbuf, send_count, send_type, dst, tag,
               (char *)rbuf + base_offset * recv_chunk, recv_count, recv_type, dst, tag,
               comm, &status);
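
  // The block received here comes from rank dst and is stored at block index
  // base_offset (== dst), which is already its final position in rbuf:
  // bit-reversal is its own inverse, so dst's offset points back to this rank
  // (again assuming a power-of-two num_procs).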


  mask >>= 1;
  i = 1;
  int phase = 0;
  curr_count = recv_count;
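  // Recursive doubling on the bit-reversed layout: each phase exchanges the
  // contiguous run of blocks gathered so far (curr_count elements starting at
  // block base_offset) with partner rank ^ mask, doubling the run until rbuf
  // holds all num_procs blocks in rank order (power-of-two num_procs assumed).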
  while (mask >= 1) {
    // destination pair for both send and recv
    dst = rank ^ mask;

    // compute offsets
    send_base_offset = base_offset;
    if (rank & mask) {
      recv_base_offset = base_offset - i;
      base_offset -= i;
    } else {
      recv_base_offset = base_offset + i;
    }
    send_offset = send_base_offset * recv_chunk;
    recv_offset = recv_base_offset * recv_chunk;

    //  printf("node %d send to %d in phase %d s_offset = %d r_offset = %d count = %d\n",rank,dst,phase, send_base_offset, recv_base_offset, curr_count);

    smpi_mpi_sendrecv((char *)rbuf + send_offset, curr_count, recv_type, dst, tag,
                 (char *)rbuf + recv_offset, curr_count, recv_type, dst, tag,
                 comm, &status);


    curr_count *= 2;
    i *= 2;
    mask >>= 1;
    phase++;
  }

  return MPI_SUCCESS;
}
Example #30
int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    is_commutative = smpi_op_is_commute(op) ? 1 : 0;

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
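
    /* disps[i] is the offset (in elements) of rank i's block within the
       concatenated total_count-element buffer */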
    
            /* Recursive doubling (used in MPICH for the noncommutative and
               non-power-of-two or block-irregular cases). */

            /* need to allocate temporary buffer to receive incoming data*/
            tmp_recvbuf= (void *) xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

            /* need to allocate another temporary buffer to accumulate
               results */
            tmp_results = (void *)xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_results = (void *)((char*)tmp_results - true_lb);

            /* copy sendbuf into tmp_results */
            if (sendbuf != MPI_IN_PLACE)
                mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);
            else
                mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);

            if (mpi_errno) return(mpi_errno);

            mask = 0x1;
            i = 0;
            while (mask < comm_size) {
                dst = rank ^ mask;

                dst_tree_root = dst >> i;
                dst_tree_root <<= i;

                my_tree_root = rank >> i;
                my_tree_root <<= i;

                /* At step 1, processes exchange (n-n/p) amount of
                   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
                   amount of data, and so forth. We use derived datatypes for this.

                   At each step, a process does not need to send data
                   indexed from my_tree_root to
                   my_tree_root+mask-1. Similarly, a process won't receive
                   data indexed from dst_tree_root to dst_tree_root+mask-1. */
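                /* For example, with comm_size = 4 and mask = 1, rank 2 has
                   my_tree_root = 2, so its sendtype covers blocks {0, 1, 3}
                   and skips its own block 2. */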

                /* calculate sendtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<my_tree_root; j++)
                    blklens[0] += recvcounts[j];
                for (j=my_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&sendtype);

                /* calculate recvtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<dst_tree_root && j<comm_size; j++)
                    blklens[0] += recvcounts[j];
                for (j=dst_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&recvtype);

                received = 0;
                if (dst < comm_size) {
                    /* tmp_results contains data to be sent in each step. Data is
                       received in tmp_recvbuf and then accumulated into
                       tmp_results. accumulation is done later below.   */ 

                    smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                                                 COLL_TAG_SCATTER,
                                                 tmp_recvbuf, 1, recvtype, dst,
                                                 COLL_TAG_SCATTER, comm,
                                                 MPI_STATUS_IGNORE);
                    received = 1;
                }

                /* if some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of non-power-of-two, we need to send them the
                   result. We use a logarithmic recursive-halving algorithm
                   for this. */
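                /* For example, with comm_size = 6 at mask = 4, ranks 2 and 3
                   have no partner (their dst would be 6 and 7), so ranks 0
                   and 1 forward the data they received to ranks 2 and 3. */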

                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data. Send data to others
                       in a tree fashion. First find root of current tree
                       that is being divided into two. k is the number of
                       least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root */ 
                    j = mask;
                    k = 0;
                    while (j) {
                        j >>= 1;
                        k++;
                    }
                    k--;
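                    /* k is now log2(mask) */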

                    tmp_mask = mask >> 1;
                    while (tmp_mask) {
                        dst = rank ^ tmp_mask;

                        tree_root = rank >> k;
                        tree_root <<= k;

                        /* send only if this proc has data and destination
                           doesn't have data. at any step, multiple processes
                           can send if they have the data */
                        if ((dst > rank) && 
                            (rank < tree_root + nprocs_completed)
                            && (dst >= tree_root + nprocs_completed)) {
                            /* send the current result */
                            smpi_mpi_send(tmp_recvbuf, 1, recvtype,
                                                     dst, COLL_TAG_SCATTER,
                                                     comm);
                        }
                        /* recv only if this proc. doesn't have data and sender
                           has data */
                        else if ((dst < rank) && 
                                 (dst < tree_root + nprocs_completed) &&
                                 (rank >= tree_root + nprocs_completed)) {
                            smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                                     COLL_TAG_SCATTER,
                                                     comm, MPI_STATUS_IGNORE); 
                            received = 1;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }

                /* The following reduction is done here instead of right
                   after the sendrecv/recv above. Doing it above in the
                   noncommutative case would require an extra temporary
                   buffer so as not to overwrite tmp_recvbuf, because
                   tmp_recvbuf may still have to be forwarded to other
                   processes in the non-power-of-two case. To avoid that
                   extra allocation, we do the reduction here. */
                if (received) {
                    if (is_commutative || (dst_tree_root < my_tree_root)) {
                        smpi_op_apply(op, tmp_recvbuf, tmp_results,
                                      &blklens[0], &datatype);
                        smpi_op_apply(op,
                                      ((char *)tmp_recvbuf + dis[1]*extent),
                                      ((char *)tmp_results + dis[1]*extent),
                                      &blklens[1], &datatype);
                    }
                    else {
                        smpi_op_apply(op, tmp_results, tmp_recvbuf,
                                      &blklens[0], &datatype);
                        smpi_op_apply(op,
                                      ((char *)tmp_results + dis[1]*extent),
                                      ((char *)tmp_recvbuf + dis[1]*extent),
                                      &blklens[1], &datatype);
                        /* copy result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
                                                       tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                /* free the per-iteration derived datatypes */
                smpi_datatype_free(&sendtype);
                smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }