Example #1
int smpi_coll_tuned_allgatherv_mpich_rdb (
  void *sendbuf,
  int sendcount,
  MPI_Datatype sendtype,
  void *recvbuf,
  int *recvcounts,
  int *displs,
  MPI_Datatype recvtype,
  MPI_Comm comm)
{
  int        comm_size, rank, j, i;
  MPI_Status status;
  MPI_Aint  recvtype_extent, recvtype_true_extent, recvtype_true_lb;
  int curr_cnt, dst, total_count;
  void *tmp_buf, *tmp_buf_rl;
  int mask, dst_tree_root, my_tree_root, position,
    send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
    offset, tmp_mask, tree_root;

  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  total_count = 0;
  for (i=0; i<comm_size; i++)
    total_count += recvcounts[i];

  if (total_count == 0) return MPI_ERR_COUNT;

  recvtype_extent = smpi_datatype_get_extent(recvtype);

  /* need to receive contiguously into tmp_buf because
     displs could make the recvbuf noncontiguous */

  smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);

  tmp_buf_rl= (void*)smpi_get_tmp_sendbuffer(total_count*(MAX(recvtype_true_extent,recvtype_extent)));

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf_rl - recvtype_true_lb);

  /* copy local data into right location in tmp_buf */
  position = 0;
  for (i=0; i<rank; i++) position += recvcounts[i];
  if (sendbuf != MPI_IN_PLACE)
  {
    smpi_datatype_copy(sendbuf, sendcount, sendtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  else
  {
    /* if in_place specified, local data is found in recvbuf */
    smpi_datatype_copy(((char *)recvbuf +
                        displs[rank]*recvtype_extent),
                       recvcounts[rank], recvtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  curr_cnt = recvcounts[rank];

  mask = 0x1;
  i = 0;
  while (mask < comm_size) {
    dst = rank ^ mask;

    /* find offset into send and recv buffers. zero out
       the least significant "i" bits of rank and dst to
       find root of src and dst subtrees. Use ranks of
       roots as index to send from and recv into buffer */

    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;
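
    /* illustrative example: comm_size = 8, rank = 5, step i = 1
       (mask = 2): dst = 5 ^ 2 = 7, my_tree_root = (5 >> 1) << 1 = 4,
       dst_tree_root = (7 >> 1) << 1 = 6, so this process sends the
       blocks of subtree {4,5} and receives those of subtree {6,7} */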

    if (dst < comm_size) {
      send_offset = 0;
      for (j=0; j<my_tree_root; j++)
        send_offset += recvcounts[j];

      recv_offset = 0;
      for (j=0; j<dst_tree_root; j++)
        recv_offset += recvcounts[j];

      smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
                        curr_cnt, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        ((char *)tmp_buf + recv_offset * recvtype_extent),
                        total_count - recv_offset, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        comm, &status);
      /* for convenience, recv is posted for a bigger amount
         than will be sent */
      last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
      curr_cnt += last_recv_cnt;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic
       recursive-halving algorithm for this. */

    /* This part of the code is not currently executed because
       recursive doubling is not used for non-power-of-two
       process counts. It is marked experimental so that it
       does not show up as untested in coverage reports. */

    /* --BEGIN EXPERIMENTAL-- */
    if (dst_tree_root + mask > comm_size) {
      nprocs_completed = comm_size - my_tree_root - mask;
      /* nprocs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;
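      /* e.g., mask = 4: the loop leaves k = 3 and the decrement
         makes k = 2, i.e., k = log2(mask) */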

      tmp_mask = mask >> 1;

      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank) &&
            (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];
          offset *= recvtype_extent;

          smpi_mpi_send(((char *)tmp_buf + offset),
                        last_recv_cnt,
                        recvtype, dst,
                        COLL_TAG_ALLGATHERV, comm);
          /* last_recv_cnt was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank) &&
                 (dst < tree_root + nprocs_completed) &&
                 (rank >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];

          smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
                        total_count - offset, recvtype,
                        dst, COLL_TAG_ALLGATHERV,
                        comm, &status);
          /* for convenience, recv is posted for a
             bigger amount than will be sent */
          last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
          curr_cnt += last_recv_cnt;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    /* --END EXPERIMENTAL-- */

    mask <<= 1;
    i++;
  }

  /* copy the gathered blocks from tmp_buf back to recvbuf at the
     user-supplied displacements */
  position = 0;
  for (j=0; j<comm_size; j++) {
    if ((sendbuf != MPI_IN_PLACE) || (j != rank)) {
      /* not necessary to copy if in_place and j==rank;
         otherwise copy */
      smpi_datatype_copy(((char *)tmp_buf + position*recvtype_extent),
                         recvcounts[j], recvtype,
                         ((char *)recvbuf + displs[j]*recvtype_extent),
                         recvcounts[j], recvtype);
    }
    position += recvcounts[j];
  }

  smpi_free_tmp_buffer(tmp_buf_rl);
  return MPI_SUCCESS;
}
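
To make the mask and tree-root arithmetic concrete, here is a minimal
standalone sketch (plain C, no SMPI dependencies; comm_size and rank are
illustrative assumptions) that traces which block ranges each
recursive-doubling step exchanges on a power-of-two communicator:

#include <stdio.h>

/* Trace the recursive-doubling schedule: at step i (mask = 1 << i),
   rank exchanges with rank ^ mask; each side sends the blocks rooted
   at its own subtree and receives those of the partner's subtree. */
int main(void)
{
  int comm_size = 8, rank = 5;       /* illustrative values */
  int mask = 0x1, i = 0;
  while (mask < comm_size) {
    int dst = rank ^ mask;
    int my_tree_root = (rank >> i) << i;
    int dst_tree_root = (dst >> i) << i;
    printf("step %d: partner %d, send blocks [%d..%d), recv blocks [%d..%d)\n",
           i, dst, my_tree_root, my_tree_root + mask,
           dst_tree_root, dst_tree_root + mask);
    mask <<= 1;
    i++;
  }
  return 0;
}
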
int
smpi_coll_tuned_bcast_scatter_rdb_allgather(void *buff, int count, MPI_Datatype
                                            data_type, int root, MPI_Comm comm)
{
  MPI_Aint extent;
  MPI_Status status;

  int i, j, k, src, dst, rank, num_procs, send_offset, recv_offset;
  int mask, relative_rank, curr_size, recv_size = 0, send_size, nbytes;
  int scatter_size, tree_root, relative_dst, dst_tree_root;
  int my_tree_root, offset, tmp_mask, num_procs_completed;
  int tag = COLL_TAG_BCAST;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(data_type);

  nbytes = extent * count;
  scatter_size = (nbytes + num_procs - 1) / num_procs;  // ceiling division 
  curr_size = (rank == root) ? nbytes : 0;      // root starts with all the data
  relative_rank = (rank >= root) ? rank - root : rank - root + num_procs;
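  /* e.g., root = 2, num_procs = 5: ranks 2,3,4,0,1 get relative
     ranks 0,1,2,3,4, so the scatter tree is rooted at 'root' */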

  mask = 0x1;
  while (mask < num_procs) {
    if (relative_rank & mask) {
      src = rank - mask;
      if (src < 0)
        src += num_procs;
      recv_size = nbytes - relative_rank * scatter_size;
      //  recv_size is larger than what might actually be sent by the
      //  sender. We don't need to compute the exact value because MPI
      //  allows you to post a larger recv.
      if (recv_size <= 0)
        curr_size = 0;          // this process receives no data
                                // because of the uneven division
      else {
        smpi_mpi_recv((char *)buff + relative_rank * scatter_size, recv_size,
                 MPI_BYTE, src, tag, comm, &status);
        curr_size = smpi_mpi_get_count(&status, MPI_BYTE);
      }
      break;
    }
    mask <<= 1;
  }

  // This process is responsible for all processes that have bits
  // set from the LSB up to (but not including) mask.  Because of
  // the "not including", we start by shifting mask back down
  // one.

  mask >>= 1;
  while (mask > 0) {
    if (relative_rank + mask < num_procs) {
      send_size = curr_size - scatter_size * mask;
      // mask is also the size of this process's subtree 

      if (send_size > 0) {
        dst = rank + mask;
        if (dst >= num_procs)
          dst -= num_procs;
        smpi_mpi_send((char *)buff + scatter_size * (relative_rank + mask),
                 send_size, MPI_BYTE, dst, tag, comm);

        curr_size -= send_size;
      }
    }
    mask >>= 1;
  }

  // scatter phase done; now do the allgather

  mask = 0x1;
  i = 0;
  while (mask < num_procs) {
    relative_dst = relative_rank ^ mask;

    dst = (relative_dst + root) % num_procs;

    /* find offset into send and recv buffers.
       zero out the least significant "i" bits of relative_rank and
       relative_dst to find root of src and dst
       subtrees. Use ranks of roots as index to send from
       and recv into  buffer */

    dst_tree_root = relative_dst >> i;
    dst_tree_root <<= i;

    my_tree_root = relative_rank >> i;
    my_tree_root <<= i;

    send_offset = my_tree_root * scatter_size;
    recv_offset = dst_tree_root * scatter_size;

    if (relative_dst < num_procs) {
      smpi_mpi_sendrecv((char *)buff + send_offset, curr_size, MPI_BYTE, dst, tag,
                   (char *)buff + recv_offset, scatter_size * mask, MPI_BYTE, dst,
                   tag, comm, &status);
      recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
      curr_size += recv_size;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic recursive-halving algorithm
       for this. */

    if (dst_tree_root + mask > num_procs) {
      num_procs_completed = num_procs - my_tree_root - mask;
      /* num_procs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      offset = scatter_size * (my_tree_root + mask);
      tmp_mask = mask >> 1;

      while (tmp_mask) {
        relative_dst = relative_rank ^ tmp_mask;
        dst = (relative_dst + root) % num_procs;

        tree_root = relative_rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. */

        if ((relative_dst > relative_rank)
            && (relative_rank < tree_root + num_procs_completed)
            && (relative_dst >= tree_root + num_procs_completed)) {
          smpi_mpi_send((char *)buff + offset, recv_size, MPI_BYTE, dst, tag, comm);

          /* recv_size was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((relative_dst < relative_rank)
                 && (relative_dst < tree_root + num_procs_completed)
                 && (relative_rank >= tree_root + num_procs_completed)) {

          smpi_mpi_recv((char *)buff + offset, scatter_size * num_procs_completed,
                   MPI_BYTE, dst, tag, comm, &status);

          /* num_procs_completed is also equal to the no. of processes
             whose data we don't have */
          recv_size = smpi_mpi_get_count(&status, MPI_BYTE);
          curr_size += recv_size;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    mask <<= 1;
    i++;
  }

  return MPI_SUCCESS;
}
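
The scatter phase sizes chunks by ceiling division, so the trailing
ranks can end up with short or empty chunks. A minimal standalone
sketch of the resulting per-rank layout (nbytes and num_procs are
illustrative assumptions):

#include <stdio.h>

/* scatter_size = ceil(nbytes / num_procs); relative rank r ends up
   owning bytes [r * scatter_size, min((r + 1) * scatter_size, nbytes)) */
int main(void)
{
  int nbytes = 10, num_procs = 4;    /* illustrative values */
  int scatter_size = (nbytes + num_procs - 1) / num_procs;   /* = 3 */
  for (int r = 0; r < num_procs; r++) {
    int size = nbytes - r * scatter_size;
    if (size > scatter_size) size = scatter_size;
    if (size < 0) size = 0;
    printf("relative rank %d: %d bytes at offset %d\n",
           r, size, r * scatter_size);
  }
  return 0;
}
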
int
smpi_coll_tuned_allgather_rdb(void *sbuf, int send_count,
                              MPI_Datatype send_type, void *rbuf,
                              int recv_count, MPI_Datatype recv_type,
                              MPI_Comm comm)
{
  // MPI variables
  MPI_Status status;
  MPI_Aint send_chunk, recv_chunk;

  // local int variables
  unsigned int i, j, k, dst, send_offset, recv_offset, tree_root;
  int dst_tree_root, rank_tree_root, last_recv_count = 0, num_procs_completed;
  int offset, tmp_mask;
  int tag = COLL_TAG_ALLGATHER;
  int mask = 1;
  int success = 0;
  int curr_count = recv_count;

  // local buffer pointers
  char *send_ptr = (char *) sbuf;
  char *recv_ptr = (char *) rbuf;

  // get size of the communicator, followed by rank 
  unsigned int num_procs = smpi_comm_size(comm);
  unsigned int rank = smpi_comm_rank(comm);

  // get size of single element's type for send buffer and recv buffer
  send_chunk = smpi_datatype_get_extent(send_type);
  recv_chunk = smpi_datatype_get_extent(recv_type);

  // multiply size of each element by number of elements to send or recv
  send_chunk *= send_count;
  recv_chunk *= recv_count;

  // perform a local copy
  smpi_mpi_sendrecv(send_ptr, send_count, send_type, rank, tag,
               recv_ptr + rank * recv_chunk, recv_count, recv_type, rank, tag,
               comm, &status);

  i = 0;
  while (mask < num_procs) {
    dst = rank ^ mask;
    dst_tree_root = dst >> i;
    dst_tree_root <<= i;
    rank_tree_root = rank >> i;
    rank_tree_root <<= i;
    send_offset = rank_tree_root * send_chunk;
    recv_offset = dst_tree_root * recv_chunk;

    if (dst < num_procs) {
      smpi_mpi_sendrecv(recv_ptr + send_offset, curr_count, send_type, dst,
                   tag, recv_ptr + recv_offset, mask * recv_count,
                   recv_type, dst, tag, comm, &status);
      last_recv_count = smpi_mpi_get_count(&status, recv_type);
      curr_count += last_recv_count;
    }

    if (dst_tree_root + mask > num_procs) {
      num_procs_completed = num_procs - rank_tree_root - mask;
      /* num_procs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */

      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      offset = recv_chunk * (rank_tree_root + mask);
      tmp_mask = mask >> 1;

      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank)
            && (rank < tree_root + num_procs_completed)
            && (dst >= tree_root + num_procs_completed)) {
          smpi_mpi_send(recv_ptr + offset, last_recv_count, recv_type, dst,
                   tag, comm);

          /* last_recv_count was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank)
                 && (dst < tree_root + num_procs_completed)
                 && (rank >= tree_root + num_procs_completed)) {
          smpi_mpi_recv(recv_ptr + offset,
                   recv_count * num_procs_completed,
                   recv_type, dst, tag, comm, &status);
          // num_procs_completed is also equal to the no. of processes
          // whose data we don't have
          last_recv_count = smpi_mpi_get_count(&status, recv_type);
          curr_count += last_recv_count;
        }
        tmp_mask >>= 1;
        k--;
      }
    }

    mask <<= 1;
    i++;
  }

  return success;
}
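
For reference, the caller-side semantics these routines implement are
those of the standard MPI collectives. A minimal MPI program using
MPI_Allgather (the fixed buffer size is an illustrative assumption):

#include <mpi.h>
#include <stdio.h>

/* Each rank contributes one int; afterwards every rank holds the
   contributions of all ranks, in rank order. */
int main(int argc, char *argv[])
{
  int rank, size;
  MPI_Init(&argc, &argv);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int mine = rank * rank;            /* local contribution */
  int all[64];                       /* assumes size <= 64 */
  MPI_Allgather(&mine, 1, MPI_INT, all, 1, MPI_INT, MPI_COMM_WORLD);

  if (rank == 0)
    for (int i = 0; i < size; i++)
      printf("rank %d contributed %d\n", i, all[i]);

  MPI_Finalize();
  return 0;
}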