int Coll_alltoall_basic_linear::alltoall(void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm)
{
  int system_tag = 888;
  int i;
  int count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  int rank = comm->rank();
  int size = comm->size();
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  sendtype->extent(&lb, &sendext);
  recvtype->extent(&lb, &recvext);
  /* simple optimization */
  int err = Datatype::copy(static_cast<char *>(sendbuf) + rank * sendcount * sendext, sendcount, sendtype,
                               static_cast<char *>(recvbuf) + rank * recvcount * recvext, recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] = Request::irecv_init(static_cast<char *>(recvbuf) + i * recvcount * recvext, recvcount,
                                        recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - the goal is to minimize the search time through the message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] = Request::isend_init(static_cast<char *>(sendbuf) + i * sendcount * sendext, sendcount,
                                        sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    Request::startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    Request::waitall(count, requests, MPI_STATUSES_IGNORE);
    for(i = 0; i < count; i++) {
      if(requests[i]!=MPI_REQUEST_NULL)
        Request::unref(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
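The basic linear alltoall above simply posts one irecv and one isend per peer and waits for them all. As a hedged usage sketch (not SimGrid-internal code), the following plain MPI program issues the user-level MPI_Alltoall that a collective selector such as SMPI's may route to this implementation; whether this exact algorithm is chosen depends on the configured selector.

/* Hedged usage sketch: a plain MPI_Alltoall call, standard MPI API only. */
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  std::vector<int> sendbuf(size), recvbuf(size);
  for (int i = 0; i < size; i++)
    sendbuf[i] = rank * 100 + i;            // one element destined to each peer

  MPI_Alltoall(sendbuf.data(), 1, MPI_INT, recvbuf.data(), 1, MPI_INT, MPI_COMM_WORLD);

  std::printf("rank %d received element %d from rank 0\n", rank, recvbuf[0]);
  MPI_Finalize();
  return 0;
}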
Example #2
int PMPI_Type_get_extent(MPI_Datatype datatype, MPI_Aint * lb, MPI_Aint * extent)
{
  if (datatype == MPI_DATATYPE_NULL) {
    return MPI_ERR_TYPE;
  } else if (lb == nullptr || extent == nullptr) {
    return MPI_ERR_ARG;
  } else {
    return datatype->extent(lb, extent);
  }
}
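PMPI_Type_get_extent is a thin argument-checking wrapper around Datatype::extent(). Below is a small hedged sketch of the user-facing call, using only the standard MPI API; the vector datatype is just an illustrative choice.

/* Hedged usage sketch: query lower bound and extent of a built-in and a derived type. */
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);

  MPI_Aint lb, extent;
  MPI_Type_get_extent(MPI_INT, &lb, &extent);
  std::printf("MPI_INT: lb=%ld extent=%ld\n", (long)lb, (long)extent);

  MPI_Datatype vec;                         // a strided type: 3 blocks of 1 int, stride 4
  MPI_Type_vector(3, 1, 4, MPI_INT, &vec);
  MPI_Type_commit(&vec);
  MPI_Type_get_extent(vec, &lb, &extent);
  std::printf("vector: lb=%ld extent=%ld\n", (long)lb, (long)extent);
  MPI_Type_free(&vec);

  MPI_Finalize();
  return 0;
}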
Example #3
/*
 *  gather_intra
 *
 *  Function:  - basic gather operation
 *  Accepts:  - same arguments as MPI_Gather()
 *  Returns:  - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_basic_linear::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                          MPI_Datatype rdtype, int root, MPI_Comm comm)
{
    int i;
    int err;
    int rank;
    int size;
    char *ptmp;
    MPI_Aint incr;
    MPI_Aint extent;
    MPI_Aint lb;

    size = comm->size();
    rank = comm->rank();

    /* Everyone but root sends data and returns. */
    XBT_DEBUG("ompi_coll_tuned_gather_intra_basic_linear rank %d", rank);

    if (rank != root) {
        Request::send(sbuf, scount, sdtype, root, COLL_TAG_GATHER, comm);
        return MPI_SUCCESS;
    }

    /* I am the root, loop receiving the data. */

    rdtype->extent(&lb, &extent);
    incr = extent * rcount;
    for (i = 0, ptmp = (char *) rbuf; i < size; ++i, ptmp += incr) {
        if (i == rank) {
            if (MPI_IN_PLACE != sbuf) {
                err = Datatype::copy(sbuf, scount, sdtype,
                                      ptmp, rcount, rdtype);
            } else {
                err = MPI_SUCCESS;
            }
        } else {
            Request::recv(ptmp, rcount, rdtype, i, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE);
            err = MPI_SUCCESS;
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
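The linear gather has every non-root rank send its block directly to the root, which receives them in rank order. A minimal hedged sketch of the corresponding user-level MPI_Gather call follows (standard MPI only; which gather algorithm actually runs depends on the collective selector).

/* Hedged usage sketch: one value gathered from each rank to the root. */
#include <mpi.h>
#include <vector>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int root   = 0;
  int contribution = rank * rank;                   // one value per rank
  std::vector<int> gathered(rank == root ? size : 0);

  MPI_Gather(&contribution, 1, MPI_INT,
             rank == root ? gathered.data() : nullptr, 1, MPI_INT, root, MPI_COMM_WORLD);

  if (rank == root)
    for (int i = 0; i < size; i++)
      std::printf("root got %d from rank %d\n", gathered[i], i);

  MPI_Finalize();
  return 0;
}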
Example #4
int Coll_reduce_binomial::reduce(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return MPI_SUCCESS;
  rank = comm->rank();
  comm_size = comm->size();

  extent = datatype->get_extent();

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative =  (op==MPI_OP_NULL || op->is_commutative());
  mask = 1;

  int lroot;
  if (is_commutative)
    lroot = root;
  else
    lroot = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  datatype->extent(&true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);

  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
    recvbuf = (void*)smpi_get_tmp_recvbuffer(count * std::max(extent, true_extent));
    recvbuf = (void*)((char*)recvbuf - true_lb);
  }
  if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
    Datatype::copy(sendbuf, count, datatype, recvbuf, count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        Request::recv(tmp_buf, count, datatype, source, tag, comm, &status);

        if (is_commutative) {
          if(op!=MPI_OP_NULL) op->apply( tmp_buf, recvbuf, &count, datatype);
        } else {
          if(op!=MPI_OP_NULL) op->apply( recvbuf, tmp_buf, &count, datatype);
          Datatype::copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      Request::send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (not is_commutative && (root != 0)) {
    if (rank == 0){
      Request::send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      Request::recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
    smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return MPI_SUCCESS;
}
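The mask/relrank loop above walks a binomial tree over virtual ranks: while the bit selected by mask is clear, the rank receives a partial result from relrank | mask; once the bit is set, it sends its accumulated value to relrank & ~mask and stops. The standalone sketch below (not SimGrid code) prints that schedule for 8 processes, assuming a commutative op so that lroot == root == 0 and relrank coincides with the rank.

// Standalone sketch: print the binomial reduce schedule for comm_size = 8,
// mirroring the mask loop of Coll_reduce_binomial::reduce above.
#include <cstdio>

int main()
{
  const int comm_size = 8;
  for (int relrank = 0; relrank < comm_size; relrank++) {
    std::printf("vrank %d:", relrank);
    for (int mask = 1; mask < comm_size; mask <<= 1) {
      if ((mask & relrank) == 0) {
        int source = relrank | mask;
        if (source < comm_size)
          std::printf(" recv<-%d", source);
      } else {
        std::printf(" send->%d", relrank & ~mask);
        break; // after sending its partial result, a rank is done
      }
    }
    std::printf("\n");
  }
  return 0;
}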
int Coll_allreduce_mvapich2::allreduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, MPI_Comm comm)
{

  int mpi_errno = MPI_SUCCESS;
  //int rank = 0,
  int comm_size = 0;

  comm_size = comm->size();
  //rank = comm->rank();

  if (count == 0) {
      return MPI_SUCCESS;
  }

  if (mv2_allreduce_thresholds_table == NULL)
    init_mv2_allreduce_tables_stampede();

  /* check if multiple threads are calling this collective function */

  MPI_Aint sendtype_size = 0;
  long nbytes = 0;
  int is_commutative = 0;
  MPI_Aint true_lb, true_extent;

  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  datatype->extent(&true_lb, &true_extent);
  is_commutative = op->is_commutative();

  {
    int range = 0, range_threshold = 0, range_threshold_intra = 0;
    int is_two_level = 0;

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
        (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    /* skip mcast pointers if mcast is not available */
    if (mv2_allreduce_thresholds_table[range].mcast_enabled != 1) {
        while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) &&
               ((mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection ==
                 &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
                (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection ==
                 &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2))) {
            range_threshold++;
        }
    }
    while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max) &&
           (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }
    if(mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1){
        is_two_level = 1;
    }
    /* Search for corresponding intra-node function */
    while ((range_threshold_intra < (mv2_allreduce_thresholds_table[range].size_intra_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max) &&
           (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max != -1)) {
        range_threshold_intra++;
    }

    MV2_Allreducection =
        mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection;

    MV2_Allreduce_intra_function =
        mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].MV2_pt_Allreducection;

    /* check if mcast is ready, otherwise replace mcast with another algorithm */
    if ((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
        (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)) {
        MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
    }

    if (is_two_level == 1) {
        // check if shm is ready, if not use another algorithm first
        if (is_commutative) {
            if (comm->get_leaders_comm() == MPI_COMM_NULL) {
                comm->init_smp();
            }
            mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count, datatype, op, comm);
        } else {
            mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count, datatype, op, comm);
        }
    } else {
        mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count, datatype, op, comm);
    }
  }

  //comm->ch.intra_node_done=0;

  return mpi_errno;
}
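The mvapich2 allreduce selector above only picks an implementation (two-level, point-to-point recursive doubling, etc.) from its tuning tables based on communicator size, message size and operator commutativity. The hedged sketch below is the plain user-level MPI_Allreduce call that such a selector serves; which branch a given run takes depends on the tables and is not asserted here.

/* Hedged usage sketch: a global sum visible on every rank. */
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  double local      = rank + 1.0;
  double global_sum = 0.0;
  MPI_Allreduce(&local, &global_sum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);

  // every rank ends up with the same value: size * (size + 1) / 2
  std::printf("rank %d: sum = %g\n", rank, global_sum);
  MPI_Finalize();
  return 0;
}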
int Coll_reduce_mvapich2_two_level::reduce( const void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    const unsigned char* in_buf = nullptr;
    unsigned char *out_buf = nullptr, *tmp_buf = nullptr;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0;

    // if not set (i.e. the algorithm is used directly, without the mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=Coll_reduce_mpich::reduce;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=Coll_reduce_mpich::reduce;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    my_rank = comm->rank();
    total_size = comm->size();
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    leader_comm = comm->get_leaders_comm();
    int* leaders_map = comm->get_leaders_map();
    leader_of_root = comm->group()->rank(leaders_map[root]);
    leader_root = leader_comm->group()->rank(leaders_map[root]);

    is_commutative= (op==MPI_OP_NULL || op->is_commutative());

    datatype->extent(&true_lb,
                                       &true_extent);
    extent =datatype->get_extent();
    stride = count * std::max(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
              tmp_buf = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
              tmp_buf = tmp_buf - true_lb;
            }

            if (sendbuf != MPI_IN_PLACE) {
              in_buf = static_cast<const unsigned char*>(sendbuf);
            } else {
              in_buf = static_cast<const unsigned char*>(recvbuf);
            }

            if (local_rank == 0) {
                if (my_rank != root) {
                    out_buf = tmp_buf;
                } else {
                    out_buf = static_cast<unsigned char*>(recvbuf);
                    if (in_buf == out_buf) {
                        in_buf  = static_cast<const unsigned char*>(MPI_IN_PLACE);
                        out_buf = static_cast<unsigned char*>(recvbuf);
                    }
                }
            } else {
                in_buf  = static_cast<const unsigned char*>(sendbuf);
                out_buf = nullptr;
            }

            if (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
              mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            } else {
              mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op, 0, shmem_comm);
            }

            if (local_rank == 0 && root != my_rank) {
                Request::send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                Request::recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) {
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2;
            } else {
                reduce_fn = &MPIR_Reduce_binomial_MV2;
            }
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if (tmp_buf != nullptr)
          smpi_free_tmp_buffer(tmp_buf + true_lb);
        goto fn_exit;
    }


    if (local_rank == 0) {
        leader_comm = comm->get_leaders_comm();
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
        tmp_buf          = smpi_get_tmp_sendbuffer(count * std::max(extent, true_extent));
        tmp_buf          = tmp_buf - true_lb;
    }
    if (sendbuf != MPI_IN_PLACE) {
      in_buf = static_cast<const unsigned char*>(sendbuf);
    } else {
      in_buf = static_cast<const unsigned char*>(recvbuf);
    }
    if (local_rank == 0) {
      out_buf = static_cast<unsigned char*>(tmp_buf);
    } else {
      out_buf = nullptr;
    }


    if(local_size > 1) {
        /* Let's do the intra-node reduce operations, if we have more than one
         * process in the node */

        /* Fix the input and output buffers for the intra-node reduce.
         * Node leaders will have the reduced data in tmp_buf after
         * this step */
        if (MV2_Reduce_intra_function == &MPIR_Reduce_shmem_MV2) {
            if (is_commutative == 1 && (count * (std::max(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op, intra_node_root, shmem_comm);
            } else {
                mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count, datatype, op,
                                                                  intra_node_root, shmem_comm);
            }
        } else {
            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count, datatype, op,
                                                  intra_node_root, shmem_comm);
        }
    } else {
      smpi_free_tmp_buffer(tmp_buf + true_lb);
      tmp_buf = (unsigned char*)in_buf; // only one process on this node: reuse the input buffer for the inter-leader phase
    }

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the
                 * root of the reduce op. So, I will write the
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) {
                  in_buf  = tmp_buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                } else {

                  unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
                  Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
                  // in_buf = MPI_IN_PLACE;
                  in_buf  = buf;
                  out_buf = static_cast<unsigned char*>(recvbuf);
                }
            } else {
              unsigned char* buf = smpi_get_tmp_sendbuffer(count * datatype->get_extent());
              Datatype::copy(tmp_buf, count, datatype, buf, count, datatype);
              // in_buf = MPI_IN_PLACE;
              in_buf  = buf;
              out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = nullptr;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
      /* Send the message to the root if the leader is not the
       * root of the reduce operation. The reduced data is in tmp_buf */
      if ((local_rank == 0) && (root != my_rank) && (leader_root == leader_comm_rank)) {
        Request::send(tmp_buf, count, datatype, root, COLL_TAG_REDUCE + 1, comm);
      }
      if ((local_rank != 0) && (root == my_rank)) {
        Request::recv(recvbuf, count, datatype, leader_of_root, COLL_TAG_REDUCE + 1, comm, MPI_STATUS_IGNORE);
      }
      smpi_free_tmp_buffer(tmp_buf + true_lb);

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) {
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }

  fn_exit:
    return mpi_errno;
}
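The two-level reduce splits the operation into an intra-node phase on shmem_comm followed by an inter-leader phase on leader_comm, with an extra point-to-point hop when the root is not a node leader. A hedged user-level sketch of the MPI_Reduce call it serves follows (standard MPI only; whether this path is selected depends on the mvapich2 selector and init_smp).

/* Hedged usage sketch: reduce the rank numbers to a sum on the root. */
#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  const int root = 0;
  int local = rank;
  int total = 0;
  MPI_Reduce(&local, &total, 1, MPI_INT, MPI_SUM, root, MPI_COMM_WORLD);

  if (rank == root)
    std::printf("sum of ranks = %d (expected %d)\n", total, size * (size - 1) / 2);
  MPI_Finalize();
  return 0;
}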
Example #7
int Coll_gather_ompi_binomial::gather(void* sbuf, int scount, MPI_Datatype sdtype, void* rbuf, int rcount,
                                      MPI_Datatype rdtype, int root, MPI_Comm comm)
{
    int line = -1;
    int i;
    int rank;
    int vrank;
    int size;
    int total_recv = 0;
    char *ptmp     = NULL;
    char *tempbuf  = NULL;
    int err;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent;
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;


    size = comm->size();
    rank = comm->rank();

    XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d", rank);

    /* create the binomial tree */
   // COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
    bmtree = ompi_coll_tuned_topo_build_in_order_bmtree(comm, root);
    // data->cached_in_order_bmtree;

    sdtype->extent(&slb, &sextent);
    sdtype->extent(&strue_lb, &strue_extent);

    vrank = (rank - root + size) % size;

    if (rank == root) {
        rdtype->extent(&rlb, &rextent);
        rdtype->extent(&rtrue_lb, &rtrue_extent);
        if (0 == root) {
          /* root on 0, just use the recv buffer */
          ptmp = (char*)rbuf;
          if (sbuf != MPI_IN_PLACE) {
            err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          }
        } else {
          /* root is not on 0, allocate temp buffer for recv,
           * rotate data at the end */
          tempbuf = (char*)smpi_get_tmp_recvbuffer(rtrue_extent + (rcount * size - 1) * rextent);
          if (NULL == tempbuf) {
            err  = MPI_ERR_OTHER;
            line = __LINE__;
            goto err_hndl;
          }

          ptmp = tempbuf - rlb;
          if (sbuf != MPI_IN_PLACE) {
            /* copy from sbuf to temp buffer */
            err = Datatype::copy(sbuf, scount, sdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          } else {
            /* copy from rbuf to temp buffer  */
            err = Datatype::copy((char*)rbuf + rank * rextent * rcount, rcount, rdtype, ptmp, rcount, rdtype);
            if (MPI_SUCCESS != err) {
              line = __LINE__;
              goto err_hndl;
            }
          }
        }
        total_recv = rcount;
    } else if (!(vrank % 2)) {
      /* other non-leaf nodes, allocate temp buffer for data received from
       * children, the most we need is half of the total data elements due
       * to the property of the binomial tree */
      tempbuf = (char*)smpi_get_tmp_sendbuffer(strue_extent + (scount * size - 1) * sextent);
      if (NULL == tempbuf) {
        err  = MPI_ERR_OTHER;
        line = __LINE__;
        goto err_hndl;
      }

      ptmp = tempbuf - slb;
      /* local copy to tempbuf */
      err = Datatype::copy(sbuf, scount, sdtype, ptmp, scount, sdtype);
      if (MPI_SUCCESS != err) {
        line = __LINE__;
        goto err_hndl;
      }

      /* use sdtype,scount as rdtype,rcount since they are ignored on
       * non-root procs */
      rdtype     = sdtype;
      rcount     = scount;
      rextent    = sextent;
      total_recv = rcount;
    } else {
      /* leaf nodes, no temp buffer needed, use sdtype,scount as
       * rdtype,rcount since they are ignored on non-root procs */
      ptmp       = (char*)sbuf;
      total_recv = scount;
    }

    if (!(vrank % 2)) {
      /* all non-leaf nodes recv from children */
      for (i = 0; i < bmtree->tree_nextsize; i++) {
        int mycount = 0, vkid;
        /* figure out how much data I have to send to this child */
        vkid    = (bmtree->tree_next[i] - root + size) % size;
        mycount = vkid - vrank;
        if (mycount > (size - vkid))
          mycount = size - vkid;
        mycount *= rcount;

        XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i],
                  mycount);

        Request::recv(ptmp + total_recv * rextent, mycount, rdtype, bmtree->tree_next[i], COLL_TAG_GATHER, comm,
                      &status);

        total_recv += mycount;
      }
    }

    if (rank != root) {
      /* all nodes except root send to parents */
      XBT_DEBUG("smpi_coll_tuned_gather_ompi_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv);

      Request::send(ptmp, total_recv, sdtype, bmtree->tree_prev, COLL_TAG_GATHER, comm);
  }
    if (rank == root) {
      if (root != 0) {
        /* rotate received data on root if root != 0 */
        err = Datatype::copy(ptmp, rcount * (size - root), rdtype, (char*)rbuf + rextent * root * rcount,
                             rcount * (size - root), rdtype);
        if (MPI_SUCCESS != err) {
          line = __LINE__;
          goto err_hndl;
        }

        err = Datatype::copy(ptmp + rextent * rcount * (size - root), rcount * root, rdtype, (char*)rbuf, rcount * root,
                             rdtype);
        if (MPI_SUCCESS != err) {
          line = __LINE__;
          goto err_hndl;
        }

        smpi_free_tmp_buffer(tempbuf);
      }
    } else if (!(vrank % 2)) {
      /* other non-leaf nodes */
      smpi_free_tmp_buffer(tempbuf);
    }
    ompi_coll_tuned_topo_destroy_tree(&bmtree);
    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
      smpi_free_tmp_buffer(tempbuf);

    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank);
    return err;
}
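When root != 0, the binomial gather collects blocks in virtual-rank order into a temporary buffer and then rotates them into true-rank order with the two Datatype::copy calls near the end. The standalone sketch below (not SimGrid code) replays that rotation on plain integers for size = 5, root = 2, one element per rank.

// Standalone sketch: rotate data gathered in virtual-rank order back into rank order.
#include <cstdio>
#include <vector>

int main()
{
  const int size = 5, root = 2, rcount = 1;
  std::vector<int> ptmp = {2, 3, 4, 0, 1};   // blocks in virtual-rank order (vrank 0 holds rank 2's data)
  std::vector<int> rbuf(size * rcount);

  // first copy: blocks for ranks root..size-1 go to the tail of rbuf
  for (int i = 0; i < rcount * (size - root); i++)
    rbuf[root * rcount + i] = ptmp[i];
  // second copy: blocks for ranks 0..root-1 go to the head of rbuf
  for (int i = 0; i < rcount * root; i++)
    rbuf[i] = ptmp[rcount * (size - root) + i];

  for (int v : rbuf)
    std::printf("%d ", v);                   // expected: 0 1 2 3 4
  std::printf("\n");
  return 0;
}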
Example #8
/*
 *  gather_intra_linear_sync
 *
 *  Function:  - synchronized gather operation, with a rendezvous on the first segment
 *  Accepts:  - same arguments as MPI_Gather() (the first segment size is derived from the message size)
 *  Returns:  - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount,
                                         MPI_Datatype sdtype,
                                         void *rbuf, int rcount,
                                         MPI_Datatype rdtype,
                                         int root,
                                         MPI_Comm comm)
{
    int i;
    int ret, line;
    int rank, size;
    int first_segment_count;
    size_t typelng;
    MPI_Aint extent;
    MPI_Aint lb;

    int first_segment_size=0;
    size = comm->size();
    rank = comm->rank();

    size_t dsize, block_size;
    if (rank == root) {
        dsize= rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize=sdtype->size();
        block_size = dsize * scount;
    }

     if (block_size > 92160) {
       first_segment_size = 32768;
     } else {
       first_segment_size = 1024;
     }

     XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);

     if (rank != root) {
       /* Non-root processes:
          - receive zero byte message from the root,
          - send the first segment of the data synchronously,
          - send the second segment of the data.
       */

       typelng = sdtype->size();
       sdtype->extent(&lb, &extent);
       first_segment_count = scount;
       COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count);

       Request::recv(sbuf, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE);

       Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm);

       Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root,
                     COLL_TAG_GATHER, comm);
    }

    else {
      /* Root process:
         - For every non-root node:
           - post irecv for the first segment of the message
           - send zero byte message to signal node to send the message
           - post irecv for the second segment of the message
           - wait for the first segment to complete
         - Copy local data if necessary
         - Waitall for all the second segments to complete.
      */
      char* ptmp;
      MPI_Request first_segment_req;
      MPI_Request* reqs = new (std::nothrow) MPI_Request[size];
      if (NULL == reqs) {
        ret  = -1;
        line = __LINE__;
        goto error_hndl;
      }

        typelng=rdtype->size();
        rdtype->extent(&lb, &extent);
        first_segment_count = rcount;
        COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp = (char*)rbuf + i * rcount * extent;
            first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i, COLL_TAG_GATHER, comm);

            /* send sync message */
            Request::send(rbuf, 0, MPI_BYTE, i, COLL_TAG_GATHER, comm);

            /* irecv for the second segment */
            ptmp = (char*)rbuf + (i * rcount + first_segment_count) * extent;
            reqs[i] = Request::irecv(ptmp, (rcount - first_segment_count), rdtype, i, COLL_TAG_GATHER, comm);

            /* wait on the first segment to complete */
            Request::wait(&first_segment_req, MPI_STATUS_IGNORE);
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = Datatype::copy(sbuf, scount, sdtype,
                                  (char*)rbuf + rank * rcount * extent,
                                  rcount, rdtype);
            if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }
        }

        /* wait all second segments to complete */
        ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; }

        delete[] reqs;
    }

    /* All done */

    return MPI_SUCCESS;
 error_hndl:
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret);
    return ret;
}
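The synchronized gather sizes its first segment from the per-rank block size: 32768 bytes for blocks over 92160 bytes, 1024 bytes otherwise, and COLL_TUNED_COMPUTED_SEGCOUNT turns that byte budget into an element count. The standalone sketch below (not SimGrid code) mirrors that computation, assuming the macro reduces to "segment bytes / element size" clamped to the element count; the real macro may differ in details.

// Standalone sketch (assumed simplification of COLL_TUNED_COMPUTED_SEGCOUNT).
#include <algorithm>
#include <cstdio>

int main()
{
  const size_t typelng = 8;      // e.g. the size of MPI_DOUBLE
  const int scount     = 20000;  // elements contributed per rank
  size_t block_size    = typelng * scount;

  int first_segment_size  = (block_size > 92160) ? 32768 : 1024;
  int first_segment_count =
      std::min<int>(scount, first_segment_size / static_cast<int>(typelng));

  std::printf("segment bytes=%d -> first_segment_count=%d of %d elements\n",
              first_segment_size, first_segment_count, scount);
  return 0;
}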
int
Coll_allgatherv_ompi_neighborexchange::allgatherv(const void *sbuf, int scount,
                                                  MPI_Datatype sdtype,
                                                  void* rbuf, const int *rcounts, const int *rdispls,
                                                  MPI_Datatype rdtype,
                                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;

    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;


    size = comm->size();
    rank = comm->rank();

    if (size % 2) {
        XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                  size);
        return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype, rbuf, rcounts, rdispls, rdtype, comm);
    }

    XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = sdtype->extent(&slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = rdtype->extent(&rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = Datatype::copy(tmpsend, scount, sdtype,
                              tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
    }

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps:
         update recv_data_from according to offset, and
         exchange two blocks with the appropriate neighbor.
         The send location becomes the previous receive location.
       Note: we need to create indexed datatypes to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    Request::sendrecv(tmpsend, rcounts[rank], rdtype,
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   tmprecv, rcounts[neighbor[0]], rdtype,
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   comm, MPI_STATUS_IGNORE);

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] =
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1)
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */

        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = Datatype::create_indexed(2, new_scounts, new_sdispls, rdtype,
                                      &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_sdtype->commit();

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = Datatype::create_indexed(2, new_rcounts, new_rdispls, rdtype,
                                      &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        new_rdtype->commit();

        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;

        /* Sendreceive */
        Request::sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];

        Datatype::unref(new_sdtype);
        Datatype::unref(new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank);
    return err;
}
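The neighbor-exchange allgatherv requires an even communicator size (odd sizes fall back to the ring algorithm, as shown above) and works directly on the rcounts/rdispls layout of the receive buffer. A hedged user-level sketch of a matching MPI_Allgatherv call with per-rank counts (standard MPI only; algorithm selection is left to the collective selector):

/* Hedged usage sketch: rank i contributes i+1 integers, everyone gathers everything. */
#include <mpi.h>
#include <vector>
#include <numeric>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank, size;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  std::vector<int> rcounts(size), rdispls(size);
  for (int i = 0; i < size; i++)
    rcounts[i] = i + 1;
  // displacements are the running sum of the counts
  std::partial_sum(rcounts.begin(), rcounts.end() - 1, rdispls.begin() + 1);

  std::vector<int> sbuf(rcounts[rank], rank);
  std::vector<int> rbuf(rdispls[size - 1] + rcounts[size - 1]);

  MPI_Allgatherv(sbuf.data(), rcounts[rank], MPI_INT,
                 rbuf.data(), rcounts.data(), rdispls.data(), MPI_INT, MPI_COMM_WORLD);

  if (rank == 0)
    std::printf("total gathered elements: %zu\n", rbuf.size());
  MPI_Finalize();
  return 0;
}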