Example #1
int Coll_alltoallv_ring::alltoallv(const void* send_buff, const int* send_counts, const int* send_disps, MPI_Datatype send_type,
                                   void* recv_buff, const int* recv_counts, const int* recv_disps, MPI_Datatype recv_type,
                                   MPI_Comm comm)
{
  MPI_Status s;
  MPI_Aint send_chunk, recv_chunk;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();
  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();
  int pof2 = ((num_procs != 0) && ((num_procs & (~num_procs + 1)) == num_procs)); // 1 if num_procs is a power of two
  for (i = 0; i < num_procs; i++) {

    if (pof2 == 1) {
      /* use exclusive-or algorithm */
      src = dst = rank ^ i;
    } else {
      src = (rank - i + num_procs) % num_procs;
      dst = (rank + i) % num_procs;
    }

    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst,
                 tag, recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type,
                 src, tag, comm, &s);

  }
  return MPI_SUCCESS;
}
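The schedule above picks the partner `rank ^ i` when the process count is a power of two and a shifted ring otherwise. Below is a minimal standalone sketch (plain C++, no SimGrid or MPI dependency; the process count of 6 is an arbitrary illustrative value) that prints the send/receive peers each rank would use at every step.

// Illustrative sketch: print the send/recv peers the loop above would pick
// for every rank, for both the power-of-two (XOR) and the ring schedule.
#include <cstdio>

static bool is_power_of_two(int n) { return n > 0 && (n & (n - 1)) == 0; }

int main() {
  const int num_procs = 6; // illustrative; try 8 to see the XOR schedule
  bool pof2 = is_power_of_two(num_procs);
  for (int rank = 0; rank < num_procs; rank++) {
    for (int i = 0; i < num_procs; i++) {
      int src, dst;
      if (pof2) {
        src = dst = rank ^ i;                     // exclusive-or schedule
      } else {
        src = (rank - i + num_procs) % num_procs; // receive from the left
        dst = (rank + i) % num_procs;             // send to the right
      }
      std::printf("rank %d, step %d: send to %d, recv from %d\n", rank, i, dst, src);
    }
  }
  return 0;
}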
Example #2
int Coll_alltoallv_pair::alltoallv(void *send_buff, int *send_counts, int *send_disps,
                                  MPI_Datatype send_type,
                                  void *recv_buff, int *recv_counts, int *recv_disps,
                                  MPI_Datatype recv_type, MPI_Comm comm)
{

  MPI_Aint send_chunk, recv_chunk;
  MPI_Status s;
  int i, src, dst, rank, num_procs;
  int tag = COLL_TAG_ALLTOALLV;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  rank = comm->rank();
  num_procs = comm->size();

  if ((num_procs & (num_procs - 1)) != 0)
    THROWF(arg_error, 0, "alltoallv pair algorithm can't be used with a non-power-of-two number of processes!");

  send_chunk = send_type->get_extent();
  recv_chunk = recv_type->get_extent();

  for (i = 0; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr + send_disps[dst] * send_chunk, send_counts[dst], send_type, dst, tag,
                      recv_ptr + recv_disps[src] * recv_chunk, recv_counts[src], recv_type, src, tag, comm, &s);
  }
  return MPI_SUCCESS;
}
Example #3
int
Coll_reduce_flat_tree::reduce(const void *sbuf, void *rbuf, int count,
                                 MPI_Datatype dtype, MPI_Op op,
                                 int root, MPI_Comm comm)
{
  int i, tag = COLL_TAG_REDUCE;
  int size;
  int rank;
  MPI_Aint extent;
  unsigned char* origin = nullptr;
  const unsigned char* inbuf;
  MPI_Status status;

  rank = comm->rank();
  size = comm->size();

  extent = dtype->get_extent();

  /* If not root, send data to the root. */
  if (rank != root) {
    Request::send(sbuf, count, dtype, root, tag, comm);
    return 0;
  }

  /* Root receives and reduces messages.  Allocate buffer to receive
     messages. */

  if (size > 1)
    origin = smpi_get_tmp_recvbuffer(count * extent);

  /* Initialize the receive buffer. */
  if (rank == (size - 1))
    Request::sendrecv(sbuf, count, dtype, rank, tag,
                 rbuf, count, dtype, rank, tag, comm, &status);
  else
    Request::recv(rbuf, count, dtype, size - 1, tag, comm, &status);

  /* Loop receiving and calling reduction function (C or Fortran). */

  for (i = size - 2; i >= 0; --i) {
    if (rank == i)
      inbuf = static_cast<const unsigned char*>(sbuf);
    else {
      Request::recv(origin, count, dtype, i, tag, comm, &status);
      inbuf = origin;
    }

    /* Call reduction function. */
    if(op!=MPI_OP_NULL) op->apply( inbuf, rbuf, &count, dtype);

  }

  smpi_free_tmp_buffer(origin);

  /* All done */
  return 0;
}
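The flat-tree reduction above uses SimGrid-internal calls (Request::send/recv, op->apply). As a hedged illustration of the same idea against the standard MPI API, the simplified sketch below has every non-root rank send its buffer to the root, which folds each contribution in with MPI_Reduce_local; MPI_INT/MPI_SUM and the helper name flat_tree_reduce are illustrative choices, not part of the original code.

// Illustrative flat-tree reduce over MPI_INT with MPI_SUM, using only standard
// MPI calls; a simplified sketch of the algorithm above, not the SimGrid code.
#include <mpi.h>
#include <vector>

int flat_tree_reduce(const int* sbuf, int* rbuf, int count, int root, MPI_Comm comm) {
  int rank, size;
  MPI_Comm_rank(comm, &rank);
  MPI_Comm_size(comm, &size);

  if (rank != root) {
    MPI_Send(sbuf, count, MPI_INT, root, 0, comm);
    return MPI_SUCCESS;
  }

  // Root starts from its own contribution...
  for (int k = 0; k < count; k++)
    rbuf[k] = sbuf[k];

  // ...then receives every other rank's buffer and folds it in.
  std::vector<int> tmp(count);
  for (int i = 0; i < size; i++) {
    if (i == root)
      continue;
    MPI_Recv(tmp.data(), count, MPI_INT, i, 0, comm, MPI_STATUS_IGNORE);
    MPI_Reduce_local(tmp.data(), rbuf, count, MPI_INT, MPI_SUM);
  }
  return MPI_SUCCESS;
}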
Example #4
int PMPI_Type_extent(MPI_Datatype datatype, MPI_Aint * extent)
{
  if (datatype == MPI_DATATYPE_NULL) {
    return MPI_ERR_TYPE;
  } else if (extent == nullptr) {
    return MPI_ERR_ARG;
  } else {
    *extent = datatype->get_extent();
    return MPI_SUCCESS;
  }
}
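PMPI_Type_extent above wraps the legacy MPI_Type_extent call. For context, current MPI code would normally query the same quantity through MPI_Type_get_extent, which also reports the lower bound; the usage sketch below is illustrative only.

// Query the extent of a datatype via the modern replacement for MPI_Type_extent.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  MPI_Aint lb, extent;
  MPI_Type_get_extent(MPI_INT, &lb, &extent);
  std::printf("MPI_INT: lb=%ld extent=%ld bytes\n", (long)lb, (long)extent);

  MPI_Finalize();
  return 0;
}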
Example #5
int
Coll_bcast_flattree_pipeline::bcast(void *buff, int count,
                                        MPI_Datatype data_type, int root,
                                        MPI_Comm comm)
{
  int i, j, rank, num_procs;
  int tag = COLL_TAG_BCAST;

  MPI_Aint extent;
  extent = data_type->get_extent();

  int segment = flattree_segment_in_byte / extent;
  segment = segment == 0 ? 1 : segment;
  int pipe_length = count / segment;
  int increment = segment * extent;
  if (pipe_length==0) {
    XBT_WARN("MPI_bcast_flattree_pipeline use default MPI_bcast_flattree.");
    return Coll_bcast_flattree::bcast(buff, count, data_type, root, comm);
  }
  rank = comm->rank();
  num_procs = comm->size();

  MPI_Request *request_array;
  MPI_Status *status_array;

  request_array = (MPI_Request *) xbt_malloc(pipe_length * sizeof(MPI_Request));
  status_array = (MPI_Status *) xbt_malloc(pipe_length * sizeof(MPI_Status));

  if (rank != root) {
    for (i = 0; i < pipe_length; i++) {
      request_array[i] = Request::irecv((char *)buff + (i * increment), segment, data_type, root, tag, comm);
    }
    Request::waitall(pipe_length, request_array, status_array);
  }

  else {
    // Root sends data to all others
    for (j = 0; j < num_procs; j++) {
      if (j == rank)
        continue;
      else {
        for (i = 0; i < pipe_length; i++) {
          Request::send((char *)buff + (i * increment), segment, data_type, j, tag, comm);
        }
      }
    }

  }

  free(request_array);
  free(status_array);
  return MPI_SUCCESS;
}
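The pipelining above hinges on the segmentation arithmetic at the top of the function. The standalone sketch below (plain C++; the 8192-byte segment size and the counts are assumed illustrative values, since flattree_segment_in_byte is a tuning constant defined elsewhere) reproduces that arithmetic so the segment size, number of pipeline stages, and byte increment can be inspected.

// Illustrative check of the segmentation arithmetic used above: a message of
// `count` elements is cut into `pipe_length` segments of `segment` elements,
// and the pipelined path is only taken when at least one full segment fits.
#include <cstdio>

int main() {
  const int flattree_segment_in_byte = 8192; // assumed tuning value
  const long extent = 4;                     // e.g. an int datatype
  const int count = 10000;

  int segment = flattree_segment_in_byte / extent;
  segment = segment == 0 ? 1 : segment;
  int pipe_length = count / segment; // number of full segments
  long increment = segment * extent; // byte stride between segments

  std::printf("segment=%d elements, pipe_length=%d, increment=%ld bytes\n",
              segment, pipe_length, increment);
  if (pipe_length == 0)
    std::printf("message too small: fall back to the plain flat-tree bcast\n");
  return 0;
}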
Example #6
int PMPI_Sendrecv_replace(void* buf, int count, MPI_Datatype datatype, int dst, int sendtag, int src, int recvtag,
                          MPI_Comm comm, MPI_Status* status)
{
  int retval = 0;
  if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    return MPI_ERR_TYPE;
  } else if (count < 0) {
    return MPI_ERR_COUNT;
  } else {
    int size = datatype->get_extent() * count;
    void* recvbuf = xbt_new0(char, size);
    retval = MPI_Sendrecv(buf, count, datatype, dst, sendtag, recvbuf, count, datatype, src, recvtag, comm, status);
    if(retval==MPI_SUCCESS){
      simgrid::smpi::Datatype::copy(recvbuf, count, datatype, buf, count, datatype);
    }
    xbt_free(recvbuf);

  }
  return retval;
}
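For context, a minimal usage sketch of the standard MPI_Sendrecv_replace call this wrapper implements: each rank forwards its value around a ring, and the received value overwrites the same buffer, which matches the send-into-temporary-then-copy-back behaviour coded above. The ring exchange itself is an illustrative choice.

// Usage sketch for MPI_Sendrecv_replace: every rank passes its value to the
// right neighbour and receives the left neighbour's value into the same buffer.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  int value = rank;
  int right = (rank + 1) % size;
  int left  = (rank - 1 + size) % size;

  // The single buffer is both sent and overwritten with the received data.
  MPI_Sendrecv_replace(&value, 1, MPI_INT, right, 0, left, 0,
                       MPI_COMM_WORLD, MPI_STATUS_IGNORE);

  std::printf("rank %d now holds %d\n", rank, value);
  MPI_Finalize();
  return 0;
}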
Example #7
int Coll_scatter_ompi::scatter(const void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("Coll_scatter_ompi::scatter");

    communicator_size = comm->size();
    rank = comm->rank();
    // Determine block size
    if (root == rank) {
        dsize=sdtype->size();
        block_size = dsize * scount;
    } else {
        dsize=rdtype->size();
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
      std::unique_ptr<unsigned char[]> tmp_buf;
      if (rank != root) {
        tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
        sbuf   = tmp_buf.get();
        scount = rcount;
        sdtype = rdtype;
      }
      return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
    }
    return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm);
}
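Only the algorithm-selection logic matters here: larger communicators with small blocks go to the binomial scatter, everything else to the basic linear one. The sketch below isolates that decision as a plain function (the helper name pick_scatter_algorithm is hypothetical) so the thresholds can be exercised without MPI.

// Illustrative decision logic only, with the thresholds copied from the
// routine above.
#include <cstddef>
#include <cstdio>

const char* pick_scatter_algorithm(int communicator_size, std::size_t block_size) {
  const std::size_t small_block_size = 300;
  const int small_comm_size = 10;
  if (communicator_size > small_comm_size && block_size < small_block_size)
    return "binomial";
  return "basic_linear";
}

int main() {
  std::printf("%s\n", pick_scatter_algorithm(64, 128));  // binomial
  std::printf("%s\n", pick_scatter_algorithm(8, 128));   // basic_linear
  std::printf("%s\n", pick_scatter_algorithm(64, 4096)); // basic_linear
  return 0;
}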
Example #8
int
Coll_allgatherv_pair::allgatherv(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int *recv_counts, int *recv_disps, MPI_Datatype recv_type,
                               MPI_Comm comm)
{

  MPI_Aint extent;
  unsigned int i, src, dst;
  int tag = COLL_TAG_ALLGATHERV;
  MPI_Status status;

  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  unsigned int rank = comm->rank();
  unsigned int num_procs = comm->size();

  if ((num_procs & (num_procs - 1)) != 0)
    THROWF(arg_error, 0, "allgatherv pair algorithm can't be used with a non-power-of-two number of processes!");

  extent = send_type->get_extent();

  // local send/recv
  Request::sendrecv(send_ptr, send_count, send_type, rank, tag,
               recv_ptr + recv_disps[rank] * extent,
               recv_counts[rank], recv_type, rank, tag, comm, &status);
  for (i = 1; i < num_procs; i++) {
    src = dst = rank ^ i;
    Request::sendrecv(send_ptr, send_count, send_type, dst, tag,
                 recv_ptr + recv_disps[src] * extent, recv_counts[src], recv_type,
                 src, tag, comm, &status);
  }

  return MPI_SUCCESS;
}
Example #9
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative =  (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  int lroot;
  if (is_commutative)
    lroot = root;
  else
    lroot = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    Datatype::copy(sendbuf, count, datatype, recvbuf, count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          if(op!=MPI_OP_NULL) op->apply( tmp_buf, recvbuf, &count, datatype);
        } else {
          if(op!=MPI_OP_NULL) op->apply( recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (not is_commutative && (root != 0)) {
    if (rank == 0){
      Request::send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
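The mask loop above walks a binomial tree: at each doubling of the mask, ranks whose mask bit is set ship their partial result to `relrank & ~mask` and leave, while the others receive and keep reducing. The standalone sketch below (plain C++, root fixed at 0 so relrank equals rank, comm_size of 8 chosen for illustration) prints that schedule.

// Illustrative schedule of the binomial reduction above for root 0 and a
// commutative op: senders drop out after one message, receivers keep going.
#include <cstdio>

int main() {
  const int comm_size = 8; // illustrative
  for (int rank = 0; rank < comm_size; rank++) {
    int relrank = rank; // lroot == 0 here
    for (int mask = 1; mask < comm_size; mask <<= 1) {
      if ((mask & relrank) == 0) {
        int source = relrank | mask;
        if (source < comm_size)
          std::printf("rank %d receives from %d (mask %d)\n", rank, source, mask);
      } else {
        int dst = relrank & ~mask;
        std::printf("rank %d sends to %d (mask %d) and stops\n", rank, dst, mask);
        break;
      }
    }
  }
  return 0;
}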
Example #10
int PMPI_Type_create_subarray(int ndims, const int* array_of_sizes,
                             const int* array_of_subsizes, const int* array_of_starts,
                             int order, MPI_Datatype oldtype, MPI_Datatype *newtype) {
  if (ndims<0){
    return MPI_ERR_COUNT;
  } else if (ndims==0){
    *newtype = MPI_DATATYPE_NULL;
    return MPI_SUCCESS;
  } else if (ndims==1){
    simgrid::smpi::Datatype::create_contiguous( array_of_subsizes[0], oldtype, array_of_starts[0]*oldtype->get_extent(), newtype);
    return MPI_SUCCESS;
  } else if (oldtype == MPI_DATATYPE_NULL || not oldtype->is_valid() ) {
    return MPI_ERR_TYPE;
  } else if (order != MPI_ORDER_FORTRAN && order != MPI_ORDER_C){
    return MPI_ERR_ARG;
  } else {
    return simgrid::smpi::Datatype::create_subarray(ndims, array_of_sizes, array_of_subsizes, array_of_starts, order, oldtype, newtype);
  }
}
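As a usage sketch of the standard call this wrapper forwards to, the snippet below builds a subarray type describing a 2x3 interior block of a 4x5 row-major int array; the sizes are illustrative. The ndims == 1 shortcut above corresponds to a contiguous block placed at an offset of starts[0] extents.

// Usage sketch for MPI_Type_create_subarray: a 2x3 interior block of a 4x5
// row-major (C order) array, starting at row 1, column 1.
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  int sizes[2]    = {4, 5}; // full array
  int subsizes[2] = {2, 3}; // block to pick out
  int starts[2]   = {1, 1}; // offset of the block

  MPI_Datatype block;
  MPI_Type_create_subarray(2, sizes, subsizes, starts, MPI_ORDER_C, MPI_INT, &block);
  MPI_Type_commit(&block);

  // `block` can now be used in sends/receives or MPI-IO file views.

  MPI_Type_free(&block);
  MPI_Finalize();
  return 0;
}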
Example #11
int Coll_bcast_SMP_binary::bcast(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = datatype->get_extent();

  rank = comm->rank();
  size = comm->size();
  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int host_num_core=1;
  if (comm->is_uniform()){
    host_num_core = comm->get_intra_comm()->size();
  }else{
    //implementation buggy in this case
    return Coll_bcast_mpich::bcast( buf , count, datatype,
              root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  int pipe_length = count / segment;
  int remainder = count % segment;

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      Request::send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      Request::recv(buf, count, datatype, root, tag, comm, &status);
  }
  // when a message is smaller than a block size => no pipeline
  if (count <= segment) {
    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
        if (to_inter_left < size)
          Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        request = Request::irecv(buf, count, datatype, from_inter, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          Request::send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
      }
      // case intermediate
      else {
        request = Request::irecv(buf, count, datatype, from_intra, tag, comm);
        Request::wait(&request, &status);
        Request::send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          Request::send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }

    return MPI_SUCCESS;
  }

  // pipeline bcast
  else {
    request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        for (i = 0; i < pipe_length; i++) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
          if (to_inter_left < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        Request::waitall((pipe_length), request_array, status_array);
      }
      // case intermediate
      else {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = Request::irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          Request::wait(&request_array[i], &status);
          Request::send((char *) buf + (i * increment), segment, datatype,
                   to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            Request::send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }

    free(request_array);
    free(status_array);
  }

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast.");
    Colls::bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return 1;
}
Example #12
int Coll_alltoall_2dmesh::alltoall(void *send_buff, int send_count,
                                    MPI_Datatype send_type,
                                    void *recv_buff, int recv_count,
                                    MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status *statuses, s;
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;

  char *tmp_buff1, *tmp_buff2;
  int i, j, src, dst, rank, num_procs, count, num_reqs;
  int X, Y, send_offset, recv_offset;
  int my_row_base, my_col_base, src_row_base, block_size;
  int tag = COLL_TAG_ALLTOALL;

  rank = comm->rank();
  num_procs = comm->size();
  extent = send_type->get_extent();

  if (not alltoall_check_is_2dmesh(num_procs, &X, &Y))
    return MPI_ERR_OTHER;

  my_row_base = (rank / Y) * Y;
  my_col_base = rank % Y;

  block_size = extent * send_count;

  tmp_buff1 = (char *) smpi_get_tmp_sendbuffer(block_size * num_procs * Y);
  tmp_buff2 = (char *) smpi_get_tmp_recvbuffer(block_size * Y);

  num_reqs = X;
  if (Y > X)
    num_reqs = Y;

  statuses = (MPI_Status *) xbt_malloc(num_reqs * sizeof(MPI_Status));
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));

  req_ptr = reqs;

  count = send_count * num_procs;

  for (i = 0; i < Y; i++) {
    src = i + my_row_base;
    if (src == rank)
      continue;

    recv_offset = (src % Y) * block_size * num_procs;
    *(req_ptr++) = Request::irecv(tmp_buff1 + recv_offset, count, recv_type, src, tag, comm);
  }

  for (i = 0; i < Y; i++) {
    dst = i + my_row_base;
    if (dst == rank)
      continue;
    Request::send(send_buff, count, send_type, dst, tag, comm);
  }

  Request::waitall(Y - 1, reqs, statuses);
  req_ptr = reqs;

  for (i = 0; i < Y; i++) {
    send_offset = (rank * block_size) + (i * block_size * num_procs);
    recv_offset = (my_row_base * block_size) + (i * block_size);

    if (i + my_row_base == rank)
      Request::sendrecv((char *) send_buff + recv_offset, send_count, send_type,
                   rank, tag,
                   (char *) recv_buff + recv_offset, recv_count, recv_type,
                   rank, tag, comm, &s);

    else
      Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                   rank, tag,
                   (char *) recv_buff + recv_offset, recv_count, recv_type,
                   rank, tag, comm, &s);
  }


  for (i = 0; i < X; i++) {
    src = (i * Y + my_col_base);
    if (src == rank)
      continue;
    src_row_base = (src / Y) * Y;

    *(req_ptr++) = Request::irecv((char *) recv_buff + src_row_base * block_size, recv_count * Y,
              recv_type, src, tag, comm);
  }

  for (i = 0; i < X; i++) {
    dst = (i * Y + my_col_base);
    if (dst == rank)
      continue;

    recv_offset = 0;
    for (j = 0; j < Y; j++) {
      send_offset = (dst + j * num_procs) * block_size;

      if (j + my_row_base == rank)
        Request::sendrecv((char *) send_buff + dst * block_size, send_count,
                     send_type, rank, tag, tmp_buff2 + recv_offset, recv_count,
                     recv_type, rank, tag, comm, &s);
      else
        Request::sendrecv(tmp_buff1 + send_offset, send_count, send_type,
                     rank, tag,
                     tmp_buff2 + recv_offset, recv_count, recv_type,
                     rank, tag, comm, &s);

      recv_offset += block_size;
    }

    Request::send(tmp_buff2, send_count * Y, send_type, dst, tag, comm);
  }
  Request::waitall(X - 1, reqs, statuses);
  free(reqs);
  free(statuses);
  smpi_free_tmp_buffer(tmp_buff1);
  smpi_free_tmp_buffer(tmp_buff2);
  return MPI_SUCCESS;
}
Example #13
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb, &true_extent);
    extent = datatype->get_extent();
    stride = count * std::max(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
              tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
              tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
              in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
              in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else {
                   out_buf = static_cast<unsigned char*>(recvbuf);
                   if (in_buf == out_buf) {
                     in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                     out_buf = static_cast<unsigned char*>(recvbuf);
                     }
                 }
            } else {
              in_buf  = static_cast<const unsigned char*>(sendbuf);
              out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
              mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            } else {
              mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb);
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf          = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
      in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
      in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
      out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
      out_buf = nullptr;
    }


    if(local_size > 1) {
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after
         *this step*/
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
          if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
          } else {
            mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, intra_node_root,
                                                              shmem_comm);
          }
        } else {
          mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
        }
    } else {
      smpi_free_tmp_buffer(tmp_buf + true_lb);
      tmp_buf = (unsigned char*)in_buf; // xxx
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                  in_buf  = tmp_buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                } else {

                  unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                  Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                  // in_buf = MPI_IN_PLACE;
                  in_buf  = buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
              unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
              Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
              // in_buf = MPI_IN_PLACE;
              in_buf  = buf;
              out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
      /* Send the message to the root if the leader is not the
       * root of the reduce operation. The reduced data is in tmp_buf */
      if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
        Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
      smpi_free_tmp_buffer(tmp_buf + true_lb);

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }



  fn_exit:
    return mpi_errno;
}
Example #14
int Coll_allgather_SMP_NTS::allgather(const void *sbuf, int scount,
                                      MPI_Datatype stype, void *rbuf,
                                      int rcount, MPI_Datatype rtype,
                                      MPI_Comm comm)
{
  int src, dst, comm_size, rank;
  comm_size = comm->size();
  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  int tag = COLL_TAG_ALLGATHER;

  int i, send_offset, recv_offset;
  int intra_rank, inter_rank;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }


  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  int inter_comm_size = (comm_size + num_core - 1) / num_core;
  int num_core_in_current_smp = num_core;

  if (comm_size % num_core)
    THROWF(arg_error, 0, "allgather SMP NTS algorithm can't be used with a number of processes that is not a multiple of NUM_CORE=%d!", num_core);

  /* for too small number of processes, use default implementation */
  if (comm_size <= num_core) {
    XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");
    Coll_allgather_default::allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;
  }

  // the last SMP node may have fewer running processes than the others
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * num_core);
  }
  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
               MPI_STATUS_IGNORE);

  //gather to root of each SMP

  for (i = 1; i < num_core_in_current_smp; i++) {

    dst =
        (inter_rank * num_core) + (intra_rank + i) % (num_core_in_current_smp);
    src =
        (inter_rank * num_core) + (intra_rank - i +
                                   num_core_in_current_smp) %
        (num_core_in_current_smp);
    recv_offset = src * rextent * rcount;

    Request::sendrecv(sbuf, scount, stype, dst, tag,
                 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
                 MPI_STATUS_IGNORE);

  }

  // INTER-SMP-ALLGATHER
  // Every root of each SMP node post INTER-Sendrecv, then do INTRA-Bcast for each receiving message
  // Use logical ring algorithm

  // root of each SMP
  if (intra_rank == 0) {
    MPI_Request* rrequest_array = new MPI_Request[inter_comm_size - 1];
    MPI_Request* srequest_array = new MPI_Request[inter_comm_size - 1];

    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * num_core;
    dst = ((inter_rank + 1) % inter_comm_size) * num_core;

    // post all inter Irecv
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      rrequest_array[i] = Request::irecv((char *)rbuf + recv_offset, rcount * num_core,
                                         rtype, src, tag + i, comm);
    }

    // send first message
    send_offset =
        ((inter_rank +
          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    srequest_array[0] = Request::isend((char *)rbuf + send_offset, scount * num_core,
                                       stype, dst, tag, comm);

    // loop : recv-inter , send-inter, send-intra (linear-bcast)
    for (i = 0; i < inter_comm_size - 2; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
      srequest_array[i + 1] = Request::isend((char *)rbuf + recv_offset, scount * num_core,
                                             stype, dst, tag + i + 1, comm);
      if (num_core_in_current_smp > 1) {
        Request::send((char *)rbuf + recv_offset, scount * num_core,
                      stype, (rank + 1), tag + i + 1, comm);
      }
    }

    // recv last message and send_intra
    recv_offset =
        ((inter_rank - i - 1 +
          inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
    //recv_offset = ((inter_rank + 1) % inter_comm_size) * num_core * sextent * scount;
    //i=inter_comm_size-2;
    Request::wait(&rrequest_array[i], MPI_STATUS_IGNORE);
    if (num_core_in_current_smp > 1) {
      Request::send((char *)rbuf + recv_offset, scount * num_core,
                                  stype, (rank + 1), tag + i + 1, comm);
    }

    Request::waitall(inter_comm_size - 1, srequest_array, MPI_STATUSES_IGNORE);
    delete[] rrequest_array;
    delete[] srequest_array;
  }
  // last rank of each SMP
  else if (intra_rank == (num_core_in_current_smp - 1)) {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
    }
  }
  // intermediate rank of each SMP
  else {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * num_core * sextent * scount;
      Request::recv((char *) rbuf + recv_offset, (rcount * num_core), rtype,
                    rank - 1, tag + i + 1, comm, MPI_STATUS_IGNORE);
      Request::send((char *) rbuf + recv_offset, (scount * num_core), stype,
                    (rank + 1), tag + i + 1, comm);
    }
  }

  return MPI_SUCCESS;
}
Example #15
int Coll_allgather_loosely_lr::allgather(const void *sbuf, int scount,
                                         MPI_Datatype stype, void *rbuf,
                                         int rcount, MPI_Datatype rtype,
                                         MPI_Comm comm)
{
  int comm_size, rank;
  int tag = COLL_TAG_ALLGATHER;
  int i, j, send_offset, recv_offset;
  int intra_rank, inter_rank, inter_comm_size, intra_comm_size;
  int inter_dst, inter_src;

  comm_size = comm->size();

  if (comm->get_leaders_comm() == MPI_COMM_NULL) {
    comm->init_smp();
  }
  int num_core=1;
  if (comm->is_uniform()){
    num_core = comm->get_intra_comm()->size();
  }

  if (comm_size % num_core)
    THROWF(arg_error, 0, "allgather loosely lr algorithm can't be used with a number of processes that is not a multiple of NUM_CORE=%d!", num_core);

  rank = comm->rank();
  MPI_Aint rextent, sextent;
  rextent = rtype->get_extent();
  sextent = stype->get_extent();
  MPI_Request inter_rrequest;
  MPI_Request rrequest_array[128];
  MPI_Request srequest_array[128];
  MPI_Request inter_srequest_array[128];


  int rrequest_count = 0;
  int srequest_count = 0;
  int inter_srequest_count = 0;

  MPI_Status status;

  intra_rank = rank % num_core;
  inter_rank = rank / num_core;
  inter_comm_size = (comm_size + num_core - 1) / num_core;
  intra_comm_size = num_core;

  int src_seg, dst_seg;

  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  Request::sendrecv(sbuf, scount, stype, rank, tag,
               (char *)rbuf + recv_offset, rcount, rtype, rank, tag, comm, &status);

  int dst, src;
  int inter_send_offset, inter_recv_offset;

  rrequest_count = 0;
  srequest_count = 0;
  inter_srequest_count = 0;

  for (i = 0; i < inter_comm_size; i++) {

    // inter_communication

    inter_dst = (rank + intra_comm_size) % comm_size;
    inter_src = (rank - intra_comm_size + comm_size) % comm_size;

    src_seg =
        ((inter_rank - 1 - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;
    dst_seg =
        ((inter_rank - i +
          inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

    inter_send_offset = dst_seg * sextent * scount;
    inter_recv_offset = src_seg * rextent * rcount;

    for (j = 0; j < intra_comm_size; j++) {

      // inter communication
      if (intra_rank == j) {
        if (i != inter_comm_size - 1) {

          inter_rrequest = Request::irecv((char*)rbuf + inter_recv_offset, rcount, rtype, inter_src, tag, comm);
          inter_srequest_array[inter_srequest_count++] =
              Request::isend((char*)rbuf + inter_send_offset, scount, stype, inter_dst, tag, comm);
        }
      }
      //intra_communication
      src = inter_rank * intra_comm_size + j;
      dst = inter_rank * intra_comm_size + j;

      src_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + j;
      dst_seg =
          ((inter_rank - i +
            inter_comm_size) % inter_comm_size) * intra_comm_size + intra_rank;

      send_offset = dst_seg * sextent * scount;
      recv_offset = src_seg * rextent * rcount;


      if (j != intra_rank) {

        rrequest_array[rrequest_count++] = Request::irecv((char *)rbuf + recv_offset, rcount, rtype, src, tag, comm);
        srequest_array[srequest_count++] = Request::isend((char *)rbuf + send_offset, scount, stype, dst, tag, comm);

      }
    }                           // intra loop


    // wait for inter communication to finish for these rounds (# of round equals num_core)
    if (i != inter_comm_size - 1) {
      Request::wait(&inter_rrequest, &status);
    }

  }                             //inter loop

  Request::waitall(rrequest_count, rrequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(srequest_count, srequest_array, MPI_STATUSES_IGNORE);
  Request::waitall(inter_srequest_count, inter_srequest_array, MPI_STATUSES_IGNORE);

  return MPI_SUCCESS;
}
Example #16
Type_Vector::Type_Vector(int size, MPI_Aint lb, MPI_Aint ub, int flags, int count, int block_length, int stride,
                         MPI_Datatype old_type)
    : Type_Hvector(size, lb, ub, flags, count, block_length, stride * old_type->get_extent(), old_type)
{
}
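The constructor above turns a stride given in elements into a byte stride by multiplying with the old type's extent and delegating to Type_Hvector. The standard-MPI usage sketch below shows the element-stride form, MPI_Type_vector, describing one column of a 4x5 row-major int matrix; the matrix shape is illustrative.

// Usage sketch for MPI_Type_vector: one column of a 4x5 row-major int matrix
// is 4 blocks of 1 element with a stride of 5 elements; internally the stride
// becomes stride * extent(old type) bytes, as in the constructor above.
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);

  MPI_Datatype column;
  MPI_Type_vector(4, 1, 5, MPI_INT, &column);
  MPI_Type_commit(&column);

  MPI_Aint lb, extent;
  MPI_Type_get_extent(MPI_INT, &lb, &extent);
  std::printf("byte stride used internally: %ld\n", (long)(5 * extent));

  MPI_Type_free(&column);
  MPI_Finalize();
  return 0;
}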
Example #17
Type_Indexed::Type_Indexed(int size, MPI_Aint lb, MPI_Aint ub, int flags, int count, const int* block_lengths,
                           const int* block_indices, MPI_Datatype old_type)
    : Type_Hindexed(size, lb, ub, flags, count, block_lengths, block_indices, old_type, old_type->get_extent())
{
}
Example #18
int Coll_allgather_mvapich2_smp::allgather(void *sendbuf,int sendcnt, MPI_Datatype sendtype,
                            void *recvbuf, int recvcnt,MPI_Datatype recvtype,
                            MPI_Comm  comm)
{
    int rank, size;
    int local_rank, local_size;
    int leader_comm_size = 0;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint recvtype_extent = 0;  /* Datatype extent */
    MPI_Comm shmem_comm, leader_comm;

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  if (not comm->is_uniform() || not comm->is_blocked())
    THROWF(arg_error, 0, "allgather MVAPICH2 smp algorithm can't be used with irregular deployment. Please ensure that processes deployed on the same node are contiguous and that each node has the same number of processes");

    if (recvcnt == 0) {
        return MPI_SUCCESS;
    }

    rank = comm->rank();
    size = comm->size();

    /* extract the rank,size information for the intra-node communicator */
    recvtype_extent=recvtype->get_extent();

    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader communicator */
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
    }

    /*If there is just one node, after gather itself,
     * root has all the data and it can do bcast*/
    if(local_rank == 0) {
        mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                    (void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                     recvcnt, recvtype,
                                     0, shmem_comm);
    } else {
        /* Since in allgather all the processes could have
         * their own data in place */
        if(sendbuf == MPI_IN_PLACE) {
            mpi_errno = Colls::gather((void*)((char*)recvbuf + (rank * recvcnt * recvtype_extent)),
                                         recvcnt , recvtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        } else {
            mpi_errno = Colls::gather(sendbuf, sendcnt,sendtype,
                                         recvbuf, recvcnt, recvtype,
                                         0, shmem_comm);
        }
    }
    /* Exchange the data between the node leaders*/
    if (local_rank == 0 && (leader_comm_size > 1)) {
        /*When data in each socket is different*/
        if (comm->is_uniform() != 1) {

            int *displs = NULL;
            int *recvcnts = NULL;
            int *node_sizes = NULL;
            int i = 0;

            node_sizes = comm->get_non_uniform_map();

            displs =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            recvcnts =  static_cast<int *>(xbt_malloc(sizeof (int) * leader_comm_size));
            if (not displs || not recvcnts) {
              return MPI_ERR_OTHER;
            }
            recvcnts[0] = node_sizes[0] * recvcnt;
            displs[0] = 0;

            for (i = 1; i < leader_comm_size; i++) {
                displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
                recvcnts[i] = node_sizes[i] * recvcnt;
            }


            void* sendbuf=((char*)recvbuf)+recvtype->get_extent()*displs[leader_comm->rank()];

            mpi_errno = Colls::allgatherv(sendbuf,
                                       (recvcnt*local_size),
                                       recvtype,
                                       recvbuf, recvcnts,
                                       displs, recvtype,
                                       leader_comm);
            xbt_free(displs);
            xbt_free(recvcnts);
        } else {
        void* sendtmpbuf=((char*)recvbuf)+recvtype->get_extent()*(recvcnt*local_size)*leader_comm->rank();



            mpi_errno = Coll_allgather_mpich::allgather(sendtmpbuf,
                                               (recvcnt*local_size),
                                               recvtype,
                                               recvbuf, (recvcnt*local_size), recvtype,
                                             leader_comm);

        }
    }

    /*Bcast the entire data from node leaders to all other cores*/
    mpi_errno = Colls::bcast (recvbuf, recvcnt * size, recvtype, 0, shmem_comm);
    return mpi_errno;
}
Example #19
int
Coll_allreduce_lr::allreduce(void *sbuf, void *rbuf, int rcount,
                             MPI_Datatype dtype, MPI_Op op, MPI_Comm comm)
{
  int tag = COLL_TAG_ALLREDUCE;
  MPI_Status status;
  int rank, i, size, count;
  int send_offset, recv_offset;
  int remainder, remainder_flag, remainder_offset;

  rank = comm->rank();
  size = comm->size();

  /* make it compatible with all data type */
  MPI_Aint extent;
  extent = dtype->get_extent();

  /* when the communication size is smaller than the number of processes (not supported) */
  if (rcount < size) {
    XBT_WARN("MPI_allreduce_lr uses the default MPI_allreduce.");
    Coll_allreduce_default::allreduce(sbuf, rbuf, rcount, dtype, op, comm);
    return MPI_SUCCESS;
  }

  /* when the communication size is not divisible by the number of processes:
     call the native implementation for the remaining chunk at the end of the operation */
  if (rcount % size != 0) {
    remainder = rcount % size;
    remainder_flag = 1;
    remainder_offset = (rcount / size) * size * extent;
  } else {
    remainder = remainder_flag = remainder_offset = 0;
  }

  /* size of each point-to-point communication is equal to the size of the whole message
     divided by number of processes
   */
  count = rcount / size;

  /* our ALL-REDUCE implementation
     1. copy (part of) send_buf to recv_buf
     2. use logical ring reduce-scatter
     3. use logical ring all-gather
   */

  // copy partial data
  send_offset = ((rank - 1 + size) % size) * count * extent;
  recv_offset = ((rank - 1 + size) % size) * count * extent;
  Request::sendrecv((char *) sbuf + send_offset, count, dtype, rank, tag - 1,
               (char *) rbuf + recv_offset, count, dtype, rank, tag - 1, comm,
               &status);

  // reduce-scatter
  for (i = 0; i < (size - 1); i++) {
    send_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
    recv_offset = ((rank - 2 - i + 2 * size) % size) * count * extent;
    //    recv_offset = ((rank-i+2*size)%size)*count*extent;
    Request::sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
                 tag + i, (char *) rbuf + recv_offset, count, dtype,
                 ((rank + size - 1) % size), tag + i, comm, &status);

    // compute result to rbuf+recv_offset
    if(op!=MPI_OP_NULL) op->apply( (char *) sbuf + recv_offset, (char *) rbuf + recv_offset,
                   &count, dtype);
  }

  // all-gather
  for (i = 0; i < (size - 1); i++) {
    send_offset = ((rank - i + 2 * size) % size) * count * extent;
    recv_offset = ((rank - 1 - i + 2 * size) % size) * count * extent;
    Request::sendrecv((char *) rbuf + send_offset, count, dtype, ((rank + 1) % size),
                 tag + i, (char *) rbuf + recv_offset, count, dtype,
                 ((rank + size - 1) % size), tag + i, comm, &status);
  }

  /* when the communication size is not divisible by the number of processes:
     call the native implementation for the remaining chunk at the end of the operation */
  if (remainder_flag) {
    return Colls::allreduce((char *) sbuf + remainder_offset,
                         (char *) rbuf + remainder_offset, remainder, dtype, op,
                         comm);
  }

  return 0;
}
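The allreduce above is a logical-ring reduce-scatter followed by a ring all-gather over equally sized chunks. The standalone sketch below (plain C++, 4 processes chosen for illustration) prints, for each reduce-scatter step, which chunk a rank sends to its right neighbour and which chunk it receives from its left neighbour and reduces, mirroring the send_offset/recv_offset arithmetic with offsets expressed in chunks rather than bytes.

// Illustrative trace of the reduce-scatter phase of the ring allreduce above.
#include <cstdio>

int main() {
  const int size = 4; // number of processes, illustrative
  for (int rank = 0; rank < size; rank++) {
    for (int i = 0; i < size - 1; i++) {
      int send_chunk = (rank - 1 - i + 2 * size) % size;
      int recv_chunk = (rank - 2 - i + 2 * size) % size;
      std::printf("reduce-scatter: rank %d step %d sends chunk %d to %d, "
                  "reduces chunk %d from %d\n",
                  rank, i, send_chunk, (rank + 1) % size,
                  recv_chunk, (rank - 1 + size) % size);
    }
  }
  return 0;
}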