Example #1
int Coll_gather_ompi::gather(const void *sbuf, int scount,
                                           MPI_Datatype sdtype,
                                           void* rbuf, int rcount,
                                           MPI_Datatype rdtype,
                                           int root,
                                           MPI_Comm  comm
                                           )
{
    //const int large_segment_size = 32768;
    //const int small_segment_size = 1024;

    //const size_t large_block_size = 92160;
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size = 1024;

    const int large_communicator_size = 60;
    const int small_communicator_size = 10;

    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("smpi_coll_tuned_gather_ompi");

    communicator_size = comm->size();
    rank = comm->rank();

    // Determine block size
    if (rank == root) {
        dsize = rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize = sdtype->size();
        block_size = dsize * scount;
    }

/*    if (block_size > large_block_size) {*/
/*        return smpi_coll_tuned_gather_ompi_linear_sync (sbuf, scount, sdtype, */
/*                                                         rbuf, rcount, rdtype, */
/*                                                         root, comm);*/

/*    } else*/ if (block_size > intermediate_block_size) {
        return Coll_gather_ompi_linear_sync::gather (sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         root, comm);

    } else if ((communicator_size > large_communicator_size) ||
               ((communicator_size > small_communicator_size) &&
                (block_size < small_block_size))) {
        return Coll_gather_ompi_binomial::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);

    }
    // Otherwise, use basic linear
    return Coll_gather_ompi_basic_linear::gather (sbuf, scount, sdtype,
                                                      rbuf, rcount, rdtype,
                                                      root, comm);
}
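
The function above is a pure decision: given the per-rank block size and the communicator size, it picks one of three algorithms. A minimal standalone mirror of those thresholds (hypothetical helper, independent of SimGrid), runnable on its own:

#include <cstdio>
#include <cstddef>

// Mirrors the thresholds of Coll_gather_ompi::gather (sketch only).
static const char* pick_gather_algo(size_t block_size, int comm_size) {
    const size_t intermediate_block_size = 6000;
    const size_t small_block_size        = 1024;
    const int large_communicator_size    = 60;
    const int small_communicator_size    = 10;

    if (block_size > intermediate_block_size)
        return "linear_sync";
    if (comm_size > large_communicator_size ||
        (comm_size > small_communicator_size && block_size < small_block_size))
        return "binomial";
    return "basic_linear";
}

int main() {
    std::printf("%s\n", pick_gather_algo(8000, 16)); // linear_sync
    std::printf("%s\n", pick_gather_algo(512, 32));  // binomial
    std::printf("%s\n", pick_gather_algo(2048, 8));  // basic_linear
    return 0;
}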
Example #2
int PMPI_Ssend(const void* buf, int count, MPI_Datatype datatype, int dst, int tag, MPI_Comm comm) {
  int retval = 0;

  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (dst == MPI_PROC_NULL) {
    retval = MPI_SUCCESS;
  } else if (dst >= comm->group()->size() || dst <0){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id         = simgrid::s4u::this_actor::get_pid();
    int dst_traced         = getPid(comm, dst);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("Ssend", dst,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));
    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, tag, count * datatype->size());

    simgrid::smpi::Request::ssend(buf, count, datatype, dst, tag, comm);
    retval = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
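
The validation ladder above rejects the call before any tracing happens; only a fully valid argument set reaches Request::ssend. A minimal caller sketch that passes every check (assumes at least two ranks):

#include <mpi.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int payload = 42;
    if (rank == 0) {
        // Synchronous send: returns only once rank 1 has started receiving.
        MPI_Ssend(&payload, 1, MPI_INT, 1, /*tag=*/0, MPI_COMM_WORLD);
    } else if (rank == 1) {
        MPI_Recv(&payload, 1, MPI_INT, 0, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    }
    MPI_Finalize();
    return 0;
}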
Example #3
int PMPI_Sendrecv(const void* sendbuf, int sendcount, MPI_Datatype sendtype, int dst, int sendtag, void* recvbuf,
                  int recvcount, MPI_Datatype recvtype, int src, int recvtag, MPI_Comm comm, MPI_Status* status)
{
  int retval = 0;

  smpi_bench_end();

  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (not sendtype->is_valid() || not recvtype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if (src == MPI_PROC_NULL) {
    if(status!=MPI_STATUS_IGNORE){
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    if(dst != MPI_PROC_NULL)
      simgrid::smpi::Request::send(sendbuf, sendcount, sendtype, dst, sendtag, comm);
    retval = MPI_SUCCESS;
  }else if (dst == MPI_PROC_NULL){
    simgrid::smpi::Request::recv(recvbuf, recvcount, recvtype, src, recvtag, comm, status);
    retval = MPI_SUCCESS;
  }else if (dst >= comm->group()->size() || dst <0 ||
      (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0))){
    retval = MPI_ERR_RANK;
  } else if ((sendcount < 0 || recvcount<0) ||
      (sendbuf==nullptr && sendcount > 0) || (recvbuf==nullptr && recvcount>0)) {
    retval = MPI_ERR_COUNT;
  } else if((sendtag<0 && sendtag !=  MPI_ANY_TAG)||(recvtag<0 && recvtag != MPI_ANY_TAG)){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id         = simgrid::s4u::this_actor::get_pid();
    int dst_traced         = getPid(comm, dst);
    int src_traced         = getPid(comm, src);

    // FIXME: hacky tracing: VarCollTIData expects vectors of peers, so wrap
    // the single destination and source in one-element vectors
    std::vector<int>* dst_hack = new std::vector<int>;
    std::vector<int>* src_hack = new std::vector<int>;
    dst_hack->push_back(dst_traced);
    src_hack->push_back(src_traced);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::VarCollTIData(
                           "sendRecv", -1, sendtype->is_replayable() ? sendcount : sendcount * sendtype->size(),
                           dst_hack, recvtype->is_replayable() ? recvcount : recvcount * recvtype->size(), src_hack,
                           simgrid::smpi::Datatype::encode(sendtype), simgrid::smpi::Datatype::encode(recvtype)));

    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, sendtag, sendcount * sendtype->size());

    simgrid::smpi::Request::sendrecv(sendbuf, sendcount, sendtype, dst, sendtag, recvbuf, recvcount, recvtype, src,
                                     recvtag, comm, status);
    retval = MPI_SUCCESS;

    TRACE_smpi_recv(src_traced, my_proc_id, recvtag);
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
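
Note how MPI_PROC_NULL degrades the combined call into a plain send or receive before any rank checking. The typical use is the ring shift below, a self-contained hedged sketch: the combined operation cannot deadlock the way two back-to-back blocking calls could.

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    int right = (rank + 1) % size;
    int left  = (rank + size - 1) % size;
    int sendval = rank, recvval = -1;
    // Every rank sends to its right neighbor while receiving from the left.
    MPI_Sendrecv(&sendval, 1, MPI_INT, right, 0,
                 &recvval, 1, MPI_INT, left, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
    std::printf("rank %d received %d\n", rank, recvval);
    MPI_Finalize();
    return 0;
}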
Example #4
int PMPI_Type_size_x(MPI_Datatype datatype, MPI_Count *size)
{
  if (datatype == MPI_DATATYPE_NULL) {
    return MPI_ERR_TYPE;
  } else if (size == nullptr) {
    return MPI_ERR_ARG;
  } else {
    *size = static_cast<MPI_Count>(datatype->size());
    return MPI_SUCCESS;
  }
}
Example #5
int PMPI_Pack_size(int incount, MPI_Datatype datatype, MPI_Comm comm, int* size) {
  if(incount<0){
    return MPI_ERR_COUNT;
  } else if (datatype == MPI_DATATYPE_NULL || not datatype->is_valid()){
    return MPI_ERR_TYPE;
  } else if(comm==MPI_COMM_NULL){
    return MPI_ERR_COMM;
  } else {
    *size=incount*datatype->size();
    return MPI_SUCCESS;
  }
}
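
This implementation returns exactly incount * datatype->size(); the MPI standard only promises an upper bound, since a real implementation may add envelope overhead. Portable code therefore treats the result as a buffer size, as in this hedged sketch:

#include <mpi.h>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int vals[4] = {1, 2, 3, 4};
    // Ask for an upper bound on the packed size, then pack into a buffer of
    // that size; `position` ends at the number of bytes actually used.
    int packsize = 0;
    MPI_Pack_size(4, MPI_INT, MPI_COMM_WORLD, &packsize);
    std::vector<char> scratch(packsize);
    int position = 0;
    MPI_Pack(vals, 4, MPI_INT, scratch.data(), packsize, &position, MPI_COMM_WORLD);
    MPI_Finalize();
    return 0;
}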
Example #6
int Coll_reduce_scatter_ompi::reduce_scatter(const void *sbuf, void *rbuf,
                                                    const int *rcounts,
                                                    MPI_Datatype dtype,
                                                    MPI_Op  op,
                                                    MPI_Comm  comm
                                                    )
{
    int comm_size, i, pow2;
    size_t total_message_size, dsize;
    const double a = 0.0012;
    const double b = 8.0;
    const size_t small_message_size = 12 * 1024;
    const size_t large_message_size = 256 * 1024;
    int zerocounts = 0;

    XBT_DEBUG("Coll_reduce_scatter_ompi::reduce_scatter");

    comm_size = comm->size();
    // We need data size for decision function
    dsize=dtype->size();
    total_message_size = 0;
    for (i = 0; i < comm_size; i++) {
        total_message_size += rcounts[i];
        if (0 == rcounts[i]) {
            zerocounts = 1;
        }
    }

    if (((op != MPI_OP_NULL) && not op->is_commutative()) || (zerocounts)) {
      Coll_reduce_scatter_default::reduce_scatter(sbuf, rbuf, rcounts, dtype, op, comm);
      return MPI_SUCCESS;
    }

    total_message_size *= dsize;

    // compute the smallest power of two >= comm_size
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1);

    if ((total_message_size <= small_message_size) ||
        ((total_message_size <= large_message_size) && (pow2 == comm_size)) ||
        (comm_size >= a * total_message_size + b)) {
        return
            Coll_reduce_scatter_ompi_basic_recursivehalving::reduce_scatter(sbuf, rbuf, rcounts,
                                                                        dtype, op,
                                                                        comm);
    }
    return Coll_reduce_scatter_ompi_ring::reduce_scatter(sbuf, rbuf, rcounts,
                                                     dtype, op,
                                                     comm);
}
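
Two quantities drive the branch above: the smallest power of two not below comm_size, and the linear cost model comm_size >= a * total_message_size + b. A worked check of both, runnable on its own:

#include <cstdio>

int main() {
    // Smallest power of two >= comm_size, exactly as the loop above computes it.
    int comm_size = 12, pow2;
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1) {}
    std::printf("pow2 = %d\n", pow2); // 16, so 12 ranks is not a power of two

    // Linear model: for a 40 KiB total payload,
    // a * 40960 + b = 0.0012 * 40960 + 8.0 ≈ 57.2, so the recursive-halving
    // branch is taken on this criterion only for communicators of 58+ ranks.
    const double a = 0.0012, b = 8.0;
    std::printf("threshold = %.1f\n", a * 40960 + b);
    return 0;
}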
Example #7
int Coll_scatter_ompi::scatter(const void *sbuf, int scount,
                                            MPI_Datatype sdtype,
                                            void* rbuf, int rcount,
                                            MPI_Datatype rdtype,
                                            int root, MPI_Comm  comm
                                            )
{
    const size_t small_block_size = 300;
    const int small_comm_size = 10;
    int communicator_size, rank;
    size_t dsize, block_size;

    XBT_DEBUG("Coll_scatter_ompi::scatter");

    communicator_size = comm->size();
    rank = comm->rank();
    // Determine block size
    if (root == rank) {
        dsize=sdtype->size();
        block_size = dsize * scount;
    } else {
        dsize=rdtype->size();
        block_size = dsize * rcount;
    }

    if ((communicator_size > small_comm_size) &&
        (block_size < small_block_size)) {
      std::unique_ptr<unsigned char[]> tmp_buf;
      if (rank != root) {
        // The binomial scatter reads the send arguments on every rank, so
        // non-root ranks substitute a dummy send buffer shaped like their
        // receive parameters.
        tmp_buf.reset(new unsigned char[rcount * rdtype->get_extent()]);
        sbuf   = tmp_buf.get();
        scount = rcount;
        sdtype = rdtype;
      }
      return Coll_scatter_ompi_binomial::scatter(sbuf, scount, sdtype, rbuf, rcount, rdtype, root, comm);
    }
    return Coll_scatter_ompi_basic_linear::scatter (sbuf, scount, sdtype,
                                                       rbuf, rcount, rdtype,
                                                       root, comm);
}
Example #8
int Coll_allgatherv_ompi::allgatherv(const void *sbuf, int scount,
                                               MPI_Datatype sdtype,
                                               void* rbuf, const int *rcounts,
                                               const int *rdispls,
                                               MPI_Datatype rdtype,
                                               MPI_Comm  comm
                                               )
{
    int i;
    int communicator_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgatherv_pair::allgatherv(sbuf, scount, sdtype,
                                                           rbuf, rcounts, rdispls, rdtype,
                                                           comm);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = 0;
    for (i = 0; i < communicator_size; i++) {
        total_dsize += dsize * rcounts[i];
    }

    /* Decision mirrors the corresponding allgather decision function. */
    if (total_dsize < 50000) {
        return Coll_allgatherv_ompi_bruck::allgatherv(sbuf, scount, sdtype,
                                                      rbuf, rcounts, rdispls, rdtype,
                                                      comm);

    } else {
        if (communicator_size % 2) {
            return Coll_allgatherv_ring::allgatherv(sbuf, scount, sdtype,
                                                         rbuf, rcounts, rdispls, rdtype,
                                                         comm);
        } else {
            return  Coll_allgatherv_ompi_neighborexchange::allgatherv(sbuf, scount, sdtype,
                                                                      rbuf, rcounts, rdispls, rdtype,
                                                                      comm);
        }
    }
}
Example #9
int PMPI_Rput(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank,
              MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Win win, MPI_Request* request){
  int retval = 0;
  smpi_bench_end();
  if (win == MPI_WIN_NULL) {
    retval = MPI_ERR_WIN;
  } else if (target_rank == MPI_PROC_NULL) {
    *request = MPI_REQUEST_NULL;
    retval = MPI_SUCCESS;
  } else if (target_rank <0){
    retval = MPI_ERR_RANK;
  } else if (win->dynamic()==0 && target_disp <0){
    //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address
    retval = MPI_ERR_ARG;
  } else if ((origin_count < 0 || target_count < 0) ||
            (origin_addr==nullptr && origin_count > 0)){
    retval = MPI_ERR_COUNT;
  } else if (((origin_datatype == MPI_DATATYPE_NULL) || (target_datatype == MPI_DATATYPE_NULL)) ||
            ((not origin_datatype->is_valid()) || (not target_datatype->is_valid()))) {
    retval = MPI_ERR_TYPE;
  } else if(request == nullptr){
    retval = MPI_ERR_REQUEST;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    MPI_Group group;
    win->get_group(&group);
    int dst_traced = group->actor(target_rank)->get_pid();
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData(
                           "Rput", target_rank,
                           origin_datatype->is_replayable() ? origin_count : origin_count * origin_datatype->size(),
                           simgrid::smpi::Datatype::encode(origin_datatype)));
    TRACE_smpi_send(my_proc_id, my_proc_id, dst_traced, SMPI_RMA_TAG, origin_count * origin_datatype->size());

    retval = win->put( origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count,
                           target_datatype, request);

    TRACE_smpi_comm_out(my_proc_id);
  }
  smpi_bench_begin();
  return retval;
}
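
A hedged usage sketch for the request-based put validated above: completing the MPI_Rput request only guarantees that the local buffer may be reused; remote completion still requires closing the access epoch (assumes at least two ranks):

#include <mpi.h>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int local = 0;
    MPI_Win win;
    MPI_Win_create(&local, sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    if (rank == 0) {
        int value = 42;
        MPI_Request req;
        MPI_Win_lock(MPI_LOCK_SHARED, 1, 0, win);
        MPI_Rput(&value, 1, MPI_INT, 1, 0, 1, MPI_INT, win, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE); // local completion: `value` reusable
        MPI_Win_unlock(1, win);            // remote completion at the target
    }
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}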
Example #10
int Coll_reduce_mvapich2::reduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, int root, MPI_Comm comm)
{
  if(mv2_reduce_thresholds_table == NULL)
    init_mv2_reduce_tables_stampede();

  int mpi_errno = MPI_SUCCESS;
  int range = 0;
  int range_threshold = 0;
  int range_intra_threshold = 0;
  int is_commutative, pof2;
  int comm_size = 0;
  long nbytes = 0;
  int sendtype_size;
  int is_two_level = 0;

  comm_size = comm->size();
  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  if (count == 0)
    return MPI_SUCCESS;

  is_commutative = (op==MPI_OP_NULL || op->is_commutative());

  /* find nearest power-of-two less than or equal to comm_size */
  for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
  pof2 >>=1;


  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_reduce_tuning_table - 1)) &&
      (comm_size > mv2_reduce_thresholds_table[range].numproc)) {
      range++;
  }
  /* Search for corresponding inter-leader function */
  while ((range_threshold < (mv2_reduce_thresholds_table[range].size_inter_table - 1)) &&
         (nbytes > mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max) &&
         (mv2_reduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
  }

  /* Search for corresponding intra node function */
  while ((range_intra_threshold < (mv2_reduce_thresholds_table[range].size_intra_table - 1)) &&
         (nbytes > mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max) &&
         (mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].max != -1)) {
      range_intra_threshold++;
  }

  /* Set intra-node function pointer for reduce_two_level */
  MV2_Reduce_intra_function =
      mv2_reduce_thresholds_table[range].intra_node[range_intra_threshold].MV2_pt_Reduce_function;
  /* Set inter-leader function pointer */
  MV2_Reduce_function =
      mv2_reduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Reduce_function;

  if (mv2_reduce_intra_knomial_factor < 0) {
      mv2_reduce_intra_knomial_factor = mv2_reduce_thresholds_table[range].intra_k_degree;
  }
  if (mv2_reduce_inter_knomial_factor < 0) {
      mv2_reduce_inter_knomial_factor = mv2_reduce_thresholds_table[range].inter_k_degree;
  }
  if (mv2_reduce_thresholds_table[range].is_two_level_reduce[range_threshold] == 1) {
      is_two_level = 1;
  }
  /* Call the selected reduce function */
  if (is_two_level == 1) {
      if (is_commutative == 1) {
          if (comm->get_leaders_comm() == MPI_COMM_NULL) {
              comm->init_smp();
          }
          mpi_errno = MPIR_Reduce_two_level_helper_MV2(sendbuf, recvbuf, count,
                                                       datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else if (MV2_Reduce_function == &MPIR_Reduce_inter_knomial_wrapper_MV2) {
      if (is_commutative == 1) {
          mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                          datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else if (MV2_Reduce_function == &MPIR_Reduce_redscat_gather_MV2) {
      if (/*(HANDLE_GET_KIND(op) == HANDLE_KIND_BUILTIN) &&*/ (count >= pof2)) {
          mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                          datatype, op, root, comm);
      } else {
          mpi_errno = MPIR_Reduce_binomial_MV2(sendbuf, recvbuf, count,
                                               datatype, op, root, comm);
      }
  } else {
      mpi_errno = MV2_Reduce_function(sendbuf, recvbuf, count,
                                      datatype, op, root, comm);
  }

  return mpi_errno;
}
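
All three scan loops above share one idiom: advance through a threshold table until the message fits, treating max == -1 as "no upper bound". A miniature of that idiom with a hypothetical table (names and values invented for illustration):

#include <cstdio>

// Hypothetical miniature of an mvapich2-style threshold table: each row is
// an upper message-size bound (max == -1 meaning "unbounded") plus the
// algorithm used up to that bound.
struct Threshold {
    long max;
    const char* algo;
};

static const char* scan_table(const Threshold* table, int n, long nbytes) {
    int i = 0;
    // Advance while the message exceeds the row's bound; stop on the last
    // row or on an unbounded row, exactly like the loops above.
    while (i < n - 1 && table[i].max != -1 && nbytes > table[i].max)
        i++;
    return table[i].algo;
}

int main() {
    const Threshold inter[] = {{8192, "knomial"}, {524288, "redscat_gather"}, {-1, "binomial"}};
    std::printf("%s\n", scan_table(inter, 3, 1024));    // knomial
    std::printf("%s\n", scan_table(inter, 3, 65536));   // redscat_gather
    std::printf("%s\n", scan_table(inter, 3, 1 << 20)); // binomial
    return 0;
}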
Example #11
/*
 *  gather_intra_linear_sync
 *
 *  Function:  - synchronized gather: the root handshakes with each peer via a
 *               zero-byte message before receiving that peer's data in two segments
 *  Accepts:   - same arguments as MPI_Gather(), plus the first segment size
 *  Returns:   - MPI_SUCCESS or error code
 */
int Coll_gather_ompi_linear_sync::gather(void *sbuf, int scount,
                                         MPI_Datatype sdtype,
                                         void *rbuf, int rcount,
                                         MPI_Datatype rdtype,
                                         int root,
                                         MPI_Comm comm)
{
    int i;
    int ret, line;
    int rank, size;
    int first_segment_count;
    size_t typelng;
    MPI_Aint extent;
    MPI_Aint lb;

    int first_segment_size=0;
    size = comm->size();
    rank = comm->rank();

    size_t dsize, block_size;
    if (rank == root) {
        dsize= rdtype->size();
        block_size = dsize * rcount;
    } else {
        dsize=sdtype->size();
        block_size = dsize * scount;
    }

     if (block_size > 92160) {
         first_segment_size = 32768;
     } else {
         first_segment_size = 1024;
     }

     XBT_DEBUG("smpi_coll_tuned_gather_ompi_linear_sync rank %d, segment %d", rank, first_segment_size);

     if (rank != root) {
       /* Non-root processes:
          - receive zero byte message from the root,
          - send the first segment of the data synchronously,
          - send the second segment of the data.
       */

       typelng = sdtype->size();
       sdtype->extent(&lb, &extent);
       first_segment_count = scount;
       COLL_TUNED_COMPUTED_SEGCOUNT((size_t)first_segment_size, typelng, first_segment_count);

       Request::recv(sbuf, 0, MPI_BYTE, root, COLL_TAG_GATHER, comm, MPI_STATUS_IGNORE);

       Request::send(sbuf, first_segment_count, sdtype, root, COLL_TAG_GATHER, comm);

       Request::send((char*)sbuf + extent * first_segment_count, (scount - first_segment_count), sdtype, root,
                     COLL_TAG_GATHER, comm);
    }

    else {
      /* Root process:
         - for every non-root node:
           - post irecv for the first segment of the message
           - send a zero-byte message to signal the node to start sending
           - post irecv for the second segment of the message
           - wait for the first segment to complete
         - copy local data if necessary
         - waitall for all the second segments to complete
      */
      char* ptmp;
      MPI_Request first_segment_req;
      MPI_Request* reqs = new (std::nothrow) MPI_Request[size];
      if (NULL == reqs) {
        ret  = -1;
        line = __LINE__;
        goto error_hndl;
      }

        typelng=rdtype->size();
        rdtype->extent(&lb, &extent);
        first_segment_count = rcount;
        COLL_TUNED_COMPUTED_SEGCOUNT( (size_t)first_segment_size, typelng,
                                      first_segment_count );

        for (i = 0; i < size; ++i) {
            if (i == rank) {
                /* skip myself */
                reqs[i] = MPI_REQUEST_NULL;
                continue;
            }

            /* irecv for the first segment from i */
            ptmp              = (char*)rbuf + i * rcount * extent;
            first_segment_req = Request::irecv(ptmp, first_segment_count, rdtype, i, COLL_TAG_GATHER, comm);

            /* send sync message */
            Request::send(rbuf, 0, MPI_BYTE, i, COLL_TAG_GATHER, comm);

            /* irecv for the second segment */
            ptmp    = (char*)rbuf + (i * rcount + first_segment_count) * extent;
            reqs[i] = Request::irecv(ptmp, (rcount - first_segment_count), rdtype, i, COLL_TAG_GATHER, comm);

            /* wait on the first segment to complete */
            Request::wait(&first_segment_req, MPI_STATUS_IGNORE);
        }

        /* copy local data if necessary */
        if (MPI_IN_PLACE != sbuf) {
            ret = Datatype::copy(sbuf, scount, sdtype,
                                  (char*)rbuf + rank * rcount * extent,
                                  rcount, rdtype);
            if (ret != MPI_SUCCESS) {
              line = __LINE__;
              delete[] reqs; // do not leak the request array on the error path
              goto error_hndl;
            }
        }

        /* wait all second segments to complete */
        ret = Request::waitall(size, reqs, MPI_STATUSES_IGNORE);
        if (ret != MPI_SUCCESS) {
          line = __LINE__;
          delete[] reqs; // do not leak the request array on the error path
          goto error_hndl;
        }

        delete[] reqs;
    }

    /* All done */

    return MPI_SUCCESS;
 error_hndl:
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret);
    return ret;
}
Example #12
int Coll_allgather_ompi::allgather(const void *sbuf, int scount,
                                              MPI_Datatype sdtype,
                                              void* rbuf, int rcount,
                                              MPI_Datatype rdtype,
                                              MPI_Comm  comm
                                              )
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = comm->size();

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return Coll_allgather_pair::allgather (sbuf, scount, sdtype,
                                                          rbuf, rcount, rdtype,
                                                          comm/*, module*/);
    }

    /* Determine complete data size */
    dsize=sdtype->size();
    total_dsize = dsize * scount * communicator_size;

    for (pow2_size  = 1; pow2_size < communicator_size; pow2_size <<=1);

    /* Decision based on MX 2Gb results from Grig cluster at
       The University of Tennessee, Knoxville
       - if total message size is less than 50KB use either bruck or
       recursive doubling for non-power of two and power of two nodes,
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        } else {
            return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                         rbuf, rcount, rdtype,
                                                         comm);
        }
    } else {
        if (communicator_size % 2) {
            return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                        rbuf, rcount, rdtype,
                                                        comm);
        } else {
            return  Coll_allgather_ompi_neighborexchange::allgather(sbuf, scount, sdtype,
                                                                     rbuf, rcount, rdtype,
                                                                     comm);
        }
    }

#if defined(USE_MPICH2_DECISION)
    /* Decision as in MPICH-2
       presented in Thakur et.al. "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005)
       - for power-of-two processes and small and medium size messages
       (up to 512KB) use recursive doubling
       - for non-power-of-two processes and small messages (80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return Coll_allgather_rdb::allgather(sbuf, scount, sdtype,
                                                                 rbuf, rcount, rdtype,
                                                                 comm);
    } else if (total_dsize <= 81920) {
        return Coll_allgather_bruck::allgather(sbuf, scount, sdtype,
                                                     rbuf, rcount, rdtype,
                                                     comm);
    }
    return Coll_allgather_ring::allgather(sbuf, scount, sdtype,
                                                rbuf, rcount, rdtype,
                                                comm);
#endif  /* defined(USE_MPICH2_DECISION) */
}
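
These decision functions are only defaults: SimGrid lets the user pin a specific implementation per collective through its configuration system. A hedged sketch (the option and algorithm names should be checked against your SimGrid version; platform.xml, hostfile and ./my_app are placeholders):

smpirun --cfg=smpi/allgather:bruck -np 16 -platform platform.xml -hostfile hostfile ./my_app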
Example #13
int Coll_bcast_mvapich2_intra_node::bcast(void *buffer,
                         int count,
                         MPI_Datatype datatype,
                         int root, MPI_Comm  comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size;
    int two_level_bcast = 1;
    size_t nbytes = 0;
    int is_homogeneous, is_contig;
    MPI_Aint type_size;
    unsigned char* tmp_buf = nullptr;
    MPI_Comm shmem_comm;

    if (count == 0)
        return MPI_SUCCESS;
    if (MV2_Bcast_function==NULL){
      MV2_Bcast_function=Coll_bcast_mpich::bcast;
    }

    if (MV2_Bcast_intra_node_function==NULL){
      MV2_Bcast_intra_node_function= Coll_bcast_mpich::bcast;
    }

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }

    comm_size = comm->size();
   // rank = comm->rank();
/*
    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
        is_contig = 1;
/*    else {
        MPID_Datatype_get_ptr(datatype, dtp);
        is_contig = dtp->is_contig;
    }
*/
    is_homogeneous = 1;
#ifdef MPID_HAS_HETERO
    if (comm_ptr->is_hetero)
        is_homogeneous = 0;
#endif

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();
    //}
/*    else {*/
/*        MPIR_Pack_size_impl(1, datatype, &type_size);*/
/*    }*/
    nbytes = (size_t) (count) * (type_size);
    if (comm_size <= mv2_bcast_two_level_system_size) {
        if (nbytes > mv2_bcast_short_msg && nbytes < mv2_bcast_large_msg) {
            two_level_bcast = 1;
        } else {
            two_level_bcast = 0;
        }
    }

    if (two_level_bcast == 1
#if defined(_MCST_SUPPORT_)
            || comm_ptr->ch.is_mcast_ok
#endif
        ) {

      if (not is_contig || not is_homogeneous) {
        tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

        /* TODO: Pipeline the packing and communication */
        // position = 0;
        /*            if (rank == root) {*/
        /*                mpi_errno =*/
        /*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
        /*                if (mpi_errno)*/
        /*                    MPIU_ERR_POP(mpi_errno);*/
        /*            }*/
      }

        shmem_comm = comm->get_intra_comm();
        if (not is_contig || not is_homogeneous) {
          mpi_errno = MPIR_Bcast_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
        } else {
            mpi_errno =
                MPIR_Bcast_inter_node_helper_MV2(buffer, count, datatype, root,
                                                 comm);
        }

        /* We are now done with the inter-node phase */
        if (nbytes <= mv2_knomial_intra_node_threshold) {
            if (not is_contig || not is_homogeneous) {
                mpi_errno = MPIR_Shmem_Bcast_MV2(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
            } else {
                mpi_errno = MPIR_Shmem_Bcast_MV2(buffer, count, datatype, root, shmem_comm);
            }
        } else {
            if (not is_contig || not is_homogeneous) {
                mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(tmp_buf, nbytes, MPI_BYTE, INTRA_NODE_ROOT, shmem_comm);
            } else {
                mpi_errno = MPIR_Knomial_Bcast_intra_node_MV2(buffer, count, datatype, INTRA_NODE_ROOT, shmem_comm);
            }
        }

    } else {
        if (nbytes <= mv2_bcast_short_msg) {
            mpi_errno = MPIR_Bcast_binomial_MV2(buffer, count, datatype, root,
                                                comm);
        } else {
            if (mv2_scatter_rd_inter_leader_bcast) {
                mpi_errno = MPIR_Bcast_scatter_ring_allgather_MV2(buffer, count,
                                                                  datatype,
                                                                  root,
                                                                  comm);
            } else {
                mpi_errno =
                    MPIR_Bcast_scatter_doubling_allgather_MV2(buffer, count,
                                                              datatype, root,
                                                              comm);
            }
        }
    }

    return mpi_errno;
}
Example #14
int Coll_reduce_ompi::reduce(const void *sendbuf, void *recvbuf,
                                            int count, MPI_Datatype  datatype,
                                            MPI_Op   op, int root,
                                            MPI_Comm   comm
                                            )
{
    int communicator_size=0;
    //int segsize = 0;
    size_t message_size, dsize;
    const double a1 =  0.6016 / 1024.0; /* [1/B] */
    const double b1 =  1.3496;
    const double a2 =  0.0410 / 1024.0; /* [1/B] */
    const double b2 =  9.7128;
    const double a3 =  0.0422 / 1024.0; /* [1/B] */
    const double b3 =  1.1614;
    //const double a4 =  0.0033 / 1024.0;  [1/B]
    //const double b4 =  1.6761;

    /* no limit on # of outstanding requests */
    //const int max_requests = 0;

    communicator_size = comm->size();

    /* need data size for decision function */
    dsize=datatype->size();
    message_size = dsize * count;   /* needed for decision */

    /**
     * If the operation is non-commutative we currently have a choice of the
     * linear or the in-order binary tree algorithm.
     */
    if ((op != MPI_OP_NULL) && not op->is_commutative()) {
      if ((communicator_size < 12) && (message_size < 2048)) {
        return Coll_reduce_ompi_basic_linear::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module*/);
      }
      return Coll_reduce_ompi_in_order_binary::reduce(sendbuf, recvbuf, count, datatype, op, root, comm /*, module,
                                                             0, max_requests*/);
    }

    if ((communicator_size < 8) && (message_size < 512)){
        /* Linear_0K */
        return Coll_reduce_ompi_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm);
    } else if (((communicator_size < 8) && (message_size < 20480)) ||
               (message_size < 2048) || (count <= 1)) {
        /* Binomial_0K */
        //segsize = 0;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a1 * message_size + b1)) {
        // Binomial_1K
        //segsize = 1024;
        return Coll_reduce_ompi_binomial::reduce(sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                     segsize, max_requests*/);
    } else if (communicator_size > (a2 * message_size + b2)) {
        // Pipeline_1K
        //segsize = 1024;
        return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                      segsize, max_requests*/);
    } else if (communicator_size > (a3 * message_size + b3)) {
        // Binary_32K
        //segsize = 32*1024;
        return Coll_reduce_ompi_binary::reduce( sendbuf, recvbuf, count, datatype, op, root,
                                                    comm/*, module, segsize, max_requests*/);
    }
//    if (communicator_size > (a4 * message_size + b4)) {
        // Pipeline_32K
//        segsize = 32*1024;
//    } else {
        // Pipeline_64K
//        segsize = 64*1024;
//    }
    return Coll_reduce_ompi_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm/*, module,
                                                  segsize, max_requests*/);

#if 0
    /* for small messages use linear algorithm */
    if (message_size <= 4096) {
        segsize = 0;
        fanout = communicator_size - 1;
        /* when linear implemented or taken from basic put here, right now using chain as a linear system */
        /* it is implemented and I shouldn't be calling a chain with a fanout bigger than MAXTREEFANOUT from topo.h! */
        return Coll_reduce_intra_basic_linear::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module);
        /*        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, segsize, fanout); */
    }
    if (message_size < 524288) {
        if (message_size <= 65536 ) {
            segsize = 32768;
            fanout = 8;
        } else {
            segsize = 1024;
            fanout = communicator_size/2;
        }
        /* later swap this for a binary tree */
        /*         fanout = 2; */
        return Coll_reduce_intra_chain::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                   segsize, fanout, max_requests);
    }
    segsize = 1024;
    return Coll_reduce_intra_pipeline::reduce (sendbuf, recvbuf, count, datatype, op, root, comm, module,
                                                  segsize, max_requests);
#endif  /* 0 */
}
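
The crossover constants are easiest to read with a message size plugged in. A worked check for a 64 KiB message (the branches above are tested top to bottom, so larger communicators fall into the earlier ones):

#include <cstdio>

int main() {
    // Evaluate each crossover line for a 64 KiB message:
    const double msg = 65536.0;
    std::printf("binomial if comm > %.1f\n", 0.6016 / 1024.0 * msg + 1.3496); // ~39.9
    std::printf("pipeline if comm > %.1f\n", 0.0410 / 1024.0 * msg + 9.7128); // ~12.3
    std::printf("binary   if comm > %.1f\n", 0.0422 / 1024.0 * msg + 1.1614); // ~3.9
    // The branches are tested in this order, so e.g. a 20-rank communicator
    // fails the binomial test but takes the pipeline branch.
    return 0;
}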
Example #15
int Coll_bcast_ompi::bcast(void *buff, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm  comm
                                          )
{
    /* Decision function based on MX results for
       messages up to 36MB and communicator sizes up to 64 nodes */
    const size_t small_message_size = 2048;
    const size_t intermediate_message_size = 370728;
    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;

    int communicator_size;
    //int segsize = 0;
    size_t message_size, dsize;

    communicator_size = comm->size();

    /* we need the data size for the decision function */
    dsize = datatype->size();
    message_size = dsize * (unsigned long)count;   /* needed for decision */

    /* Handle messages of small and intermediate size, and
       single-element broadcasts */
    if ((message_size < small_message_size) || (count <= 1)) {
        /* Binomial without segmentation */
        return  Coll_bcast_binomial_tree::bcast (buff, count, datatype,
                                                      root, comm);

    } else if (message_size < intermediate_message_size) {
        // SplittedBinary with 1KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    }
    /* Handle large message sizes */
    else if (communicator_size < (a_p128 * message_size + b_p128)) {
        // Pipeline with 128KB segments
        //segsize = 1024  << 7;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < 13) {
        // Split Binary with 8KB segments
        return Coll_bcast_ompi_split_bintree::bcast(buff, count, datatype,
                                                         root, comm);

    } else if (communicator_size < (a_p64 * message_size + b_p64)) {
        // Pipeline with 64KB segments
        //segsize = 1024 << 6;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    } else if (communicator_size < (a_p16 * message_size + b_p16)) {
        //Pipeline with 16KB segments
        //segsize = 1024 << 4;
        return Coll_bcast_ompi_pipeline::bcast (buff, count, datatype,
                                                     root, comm);


    }
    /* Pipeline with 8KB segments */
    //segsize = 1024 << 3;
    return Coll_bcast_flattree_pipeline::bcast (buff, count, datatype,
                                                 root, comm
                                                 /*segsize*/);
#if 0
    /* this is based on gige measurements */

    if (communicator_size  < 4) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size == 4) {
        if (message_size < 524288) segsize = 0;
        else segsize = 16384;
        return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (communicator_size <= 8 && message_size < 4096) {
        return Coll_bcast_intra_basic_linear::bcast (buff, count, datatype, root, comm, module);
    }
    if (communicator_size > 8 && message_size >= 32768 && message_size < 524288) {
        segsize = 16384;
        return  Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    if (message_size >= 524288) {
        segsize = 16384;
        return Coll_bcast_intra_pipeline::bcast (buff, count, datatype, root, comm, module, segsize);
    }
    segsize = 0;
    /* once tested can swap this back in */
    /* return Coll_bcast_intra_bmtree::bcast (buff, count, datatype, root, comm, segsize); */
    return Coll_bcast_intra_bintree::bcast (buff, count, datatype, root, comm, module, segsize);
#endif  /* 0 */
}
Example #16
int PMPI_Irecv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Request * request)
{
  int retval = 0;

  smpi_bench_end();

  if (request == nullptr) {
    retval = MPI_ERR_ARG;
  } else if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    *request = MPI_REQUEST_NULL;
    retval = MPI_SUCCESS;
  } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {

    int my_proc_id = simgrid::s4u::this_actor::get_pid();

    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("irecv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    *request = simgrid::smpi::Request::irecv(buf, count, datatype, src, tag, comm);
    retval = MPI_SUCCESS;

    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  if (retval != MPI_SUCCESS && request != nullptr)
    *request = MPI_REQUEST_NULL;
  return retval;
}
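
Note the tail of the wrapper: on failure the request is reset to MPI_REQUEST_NULL after benchmarking resumes. A minimal sketch of the intended usage pattern (post early, overlap, then wait; assumes at least two ranks):

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    if (rank == 1) {
        int value = -1;
        MPI_Request req;
        // Post the receive first, overlap other work, then complete it.
        MPI_Irecv(&value, 1, MPI_INT, 0, 7, MPI_COMM_WORLD, &req);
        /* ... computation that overlaps with the transfer ... */
        MPI_Wait(&req, MPI_STATUS_IGNORE);
        std::printf("got %d\n", value);
    } else if (rank == 0) {
        int value = 99;
        MPI_Send(&value, 1, MPI_INT, 1, 7, MPI_COMM_WORLD);
    }
    MPI_Finalize();
    return 0;
}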
Example #17
int PMPI_Compare_and_swap(const void* origin_addr, void* compare_addr, void* result_addr, MPI_Datatype datatype,
                          int target_rank, MPI_Aint target_disp, MPI_Win win)
{
  int retval = 0;
  smpi_bench_end();
  if (win == MPI_WIN_NULL) {
    retval = MPI_ERR_WIN;
  } else if (target_rank == MPI_PROC_NULL) {
    retval = MPI_SUCCESS;
  } else if (target_rank <0){
    retval = MPI_ERR_RANK;
  } else if (win->dynamic()==0 && target_disp <0){
    //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address
    retval = MPI_ERR_ARG;
  } else if (origin_addr==nullptr || result_addr==nullptr || compare_addr==nullptr){
    retval = MPI_ERR_COUNT;
  } else if ((datatype == MPI_DATATYPE_NULL) || (not datatype->is_valid())) {
    retval = MPI_ERR_TYPE;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    MPI_Group group;
    win->get_group(&group);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("Compare_and_swap", target_rank,
                                                       datatype->is_replayable() ? 1 : datatype->size(),
                                                       simgrid::smpi::Datatype::encode(datatype)));

    retval = win->compare_and_swap(origin_addr, compare_addr, result_addr, datatype, target_rank, target_disp);

    TRACE_smpi_comm_out(my_proc_id);
  }
  smpi_bench_begin();
  return retval;
}
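
A hedged usage sketch of the call being validated above: every rank races to compare-and-swap rank 0's flag from 0 to 1 inside a shared lock epoch, and the result buffer reports the value seen before the swap, so exactly one rank observes 0:

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int flag = 0; // the word every rank exposes for CAS
    MPI_Win win;
    MPI_Win_create(&flag, sizeof(int), sizeof(int), MPI_INFO_NULL, MPI_COMM_WORLD, &win);

    // Swap rank 0's flag from 0 to 1; `previous == 0` means this rank won.
    int desired = 1, expected = 0, previous = -1;
    MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
    MPI_Compare_and_swap(&desired, &expected, &previous, MPI_INT, 0, 0, win);
    MPI_Win_unlock(0, win);
    std::printf("rank %d saw previous=%d\n", rank, previous);

    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}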
Example #18
int Coll_bcast_mvapich2::bcast(void *buffer,
    int count,
    MPI_Datatype datatype,
    int root, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size/*, rank*/;
    int two_level_bcast = 1;
    long nbytes = 0;
    int range = 0;
    int range_threshold = 0;
    int range_threshold_intra = 0;
    // int is_homogeneous, is_contig;
    MPI_Aint type_size;
    //, position;
    // unsigned char *tmp_buf = NULL;
    MPI_Comm shmem_comm;
    //MPID_Datatype *dtp;

    if (count == 0)
        return MPI_SUCCESS;
    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    if (not mv2_bcast_thresholds_table)
      init_mv2_bcast_tables_stampede();
    comm_size = comm->size();
    //rank = comm->rank();

    //is_contig=1;
/*    if (HANDLE_GET_KIND(datatype) == HANDLE_KIND_BUILTIN)*/
/*        is_contig = 1;*/
/*    else {*/
/*        MPID_Datatype_get_ptr(datatype, dtp);*/
/*        is_contig = dtp->is_contig;*/
/*    }*/

    // is_homogeneous = 1;

    /* MPI_Type_size() might not give the accurate size of the packed
     * datatype for heterogeneous systems (because of padding, encoding,
     * etc). On the other hand, MPI_Pack_size() can become very
     * expensive, depending on the implementation, especially for
     * heterogeneous systems. We want to use MPI_Type_size() wherever
     * possible, and MPI_Pack_size() in other places.
     */
    //if (is_homogeneous) {
        type_size=datatype->size();

   /* } else {
        MPIR_Pack_size_impl(1, datatype, &type_size);
    }*/
    nbytes =  (count) * (type_size);

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_bcast_tuning_table - 1)) &&
           (comm_size > mv2_bcast_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    while ((range_threshold < (mv2_bcast_thresholds_table[range].size_inter_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max)
           && (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }

    /* Search for corresponding intra-node function */
    while ((range_threshold_intra <
            (mv2_bcast_thresholds_table[range].size_intra_table - 1))
           && (nbytes >
               mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max)
           && (mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].max !=
               -1)) {
        range_threshold_intra++;
    }

    MV2_Bcast_function =
        mv2_bcast_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Bcast_function;

    MV2_Bcast_intra_node_function =
        mv2_bcast_thresholds_table[range].intra_node[range_threshold_intra].MV2_pt_Bcast_function;

/*    if (mv2_user_bcast_intra == NULL && */
/*            MV2_Bcast_intra_node_function == &MPIR_Knomial_Bcast_intra_node_MV2) {*/
/*            MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;*/
/*    }*/

    if (mv2_bcast_thresholds_table[range].inter_leader[range_threshold].zcpy_pipelined_knomial_factor != -1) {
        zcpy_knomial_factor =
            mv2_bcast_thresholds_table[range].inter_leader[range_threshold].zcpy_pipelined_knomial_factor;
    }

    if (mv2_pipelined_zcpy_knomial_factor != -1) {
        zcpy_knomial_factor = mv2_pipelined_zcpy_knomial_factor;
    }

    if (MV2_Bcast_intra_node_function == NULL) {
        /* If the tuning table has no intra-node selection, fall back to the
         * default function for intra-node mcast */
        MV2_Bcast_intra_node_function = &MPIR_Shmem_Bcast_MV2;
    }

    /* Set value of pipeline segment size */
    bcast_segment_size = mv2_bcast_thresholds_table[range].bcast_segment_size;

    /* Set value of inter node knomial factor */
    mv2_inter_node_knomial_factor = mv2_bcast_thresholds_table[range].inter_node_knomial_factor;

    /* Set value of intra node knomial factor */
    mv2_intra_node_knomial_factor = mv2_bcast_thresholds_table[range].intra_node_knomial_factor;

    /* Check if we will use a two level algorithm or not */
    two_level_bcast =
#if defined(_MCST_SUPPORT_)
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold]
        || comm->ch.is_mcast_ok;
#else
        mv2_bcast_thresholds_table[range].is_two_level_bcast[range_threshold];
#endif
     if (two_level_bcast == 1) {
       // if (not is_contig || not is_homogeneous) {
//   tmp_buf = smpi_get_tmp_sendbuffer(nbytes);

/*            position = 0;*/
/*            if (rank == root) {*/
/*                mpi_errno =*/
/*                    MPIR_Pack_impl(buffer, count, datatype, tmp_buf, nbytes, &position);*/
/*                if (mpi_errno)*/
/*                    MPIU_ERR_POP(mpi_errno);*/
/*            }*/
// }
#ifdef CHANNEL_MRAIL_GEN2
        if ((mv2_enable_zcpy_bcast == 1) &&
              (&MPIR_Pipelined_Bcast_Zcpy_MV2 == MV2_Bcast_function)) {
          // if (not is_contig || not is_homogeneous) {
          //   mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
          // } else {
                mpi_errno = MPIR_Pipelined_Bcast_Zcpy_MV2(buffer, count, datatype,
                                                 root, comm);
          // }
        } else
#endif /* defined(CHANNEL_MRAIL_GEN2) */
        {
            shmem_comm = comm->get_intra_comm();
            // if (not is_contig || not is_homogeneous) {
            //   MPIR_Bcast_tune_inter_node_helper_MV2(tmp_buf, nbytes, MPI_BYTE, root, comm);
            // } else {
            MPIR_Bcast_tune_inter_node_helper_MV2(buffer, count, datatype, root, comm);
            // }

            /* We are now done with the inter-node phase */
            root = INTRA_NODE_ROOT;

            // if (not is_contig || not is_homogeneous) {
            //   mpi_errno = MV2_Bcast_intra_node_function(tmp_buf, nbytes, MPI_BYTE, root, shmem_comm);
            // } else {
            mpi_errno = MV2_Bcast_intra_node_function(buffer, count, datatype, root, shmem_comm);
            // }
        }
        /*        if (not is_contig || not is_homogeneous) {*/
        /*            if (rank != root) {*/
        /*                position = 0;*/
        /*                mpi_errno = MPIR_Unpack_impl(tmp_buf, nbytes, &position, buffer,*/
        /*                                             count, datatype);*/
        /*            }*/
        /*        }*/
    } else {
        /* We use Knomial for intra node */
        MV2_Bcast_intra_node_function = &MPIR_Knomial_Bcast_intra_node_MV2;
/*        if (mv2_enable_shmem_bcast == 0) {*/
            /* Fall back to non-tuned version */
/*            MPIR_Bcast_intra_MV2(buffer, count, datatype, root, comm);*/
/*        } else {*/
        mpi_errno = MV2_Bcast_function(buffer, count, datatype, root, comm);
/*        }*/
    }

    return mpi_errno;
}
Example #19
int Coll_allreduce_mvapich2::allreduce(const void *sendbuf,
    void *recvbuf,
    int count,
    MPI_Datatype datatype,
    MPI_Op op, MPI_Comm comm)
{

  int mpi_errno = MPI_SUCCESS;
  //int rank = 0,
  int comm_size = 0;

  comm_size = comm->size();
  //rank = comm->rank();

  if (count == 0) {
      return MPI_SUCCESS;
  }

  if (mv2_allreduce_thresholds_table == NULL)
    init_mv2_allreduce_tables_stampede();

  /* check if multiple threads are calling this collective function */

  MPI_Aint sendtype_size = 0;
  long nbytes = 0;
  int is_commutative = 0;
  MPI_Aint true_lb, true_extent;

  sendtype_size=datatype->size();
  nbytes = count * sendtype_size;

  datatype->extent(&true_lb, &true_extent);
  is_commutative = op->is_commutative();

  {
    int range = 0, range_threshold = 0, range_threshold_intra = 0;
    int is_two_level = 0;

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_allreduce_tuning_table - 1)) &&
        (comm_size > mv2_allreduce_thresholds_table[range].numproc)) {
        range++;
    }
    /* Search for corresponding inter-leader function */
    /* skip mcast pointers if mcast is not available */
    if (mv2_allreduce_thresholds_table[range].mcast_enabled != 1) {
        while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) &&
               ((mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection ==
                 &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
                (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection ==
                 &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2))) {
            range_threshold++;
        }
    }
    while ((range_threshold < (mv2_allreduce_thresholds_table[range].size_inter_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max) &&
           (mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
        range_threshold++;
    }
    if (mv2_allreduce_thresholds_table[range].is_two_level_allreduce[range_threshold] == 1) {
        is_two_level = 1;
    }
    /* Search for corresponding intra-node function */
    while ((range_threshold_intra < (mv2_allreduce_thresholds_table[range].size_intra_table - 1)) &&
           (nbytes > mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max) &&
           (mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].max != -1)) {
        range_threshold_intra++;
    }

    MV2_Allreducection =
        mv2_allreduce_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Allreducection;

    MV2_Allreduce_intra_function =
        mv2_allreduce_thresholds_table[range].intra_node[range_threshold_intra].MV2_pt_Allreducection;

    /* check if mcast is ready, otherwise replace mcast with another algorithm
     * (both branches of the original code installed the same fallback) */
    if ((MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_redscat_gather_MV2) ||
        (MV2_Allreducection == &MPIR_Allreduce_mcst_reduce_two_level_helper_MV2)) {
        MV2_Allreducection = &MPIR_Allreduce_pt2pt_rd_MV2;
    }

    if (is_two_level == 1) {
        // check if shm is ready, if not use another algorithm first
        if (is_commutative) {
            if (comm->get_leaders_comm() == MPI_COMM_NULL) {
                comm->init_smp();
            }
            mpi_errno = MPIR_Allreduce_two_level_MV2(sendbuf, recvbuf, count, datatype, op, comm);
        } else {
            mpi_errno = MPIR_Allreduce_pt2pt_rd_MV2(sendbuf, recvbuf, count, datatype, op, comm);
        }
    } else {
        mpi_errno = MV2_Allreducection(sendbuf, recvbuf, count, datatype, op, comm);
    }
  }

  //comm->ch.intra_node_done=0;

  return (mpi_errno);
}
Example #20
int PMPI_Recv(void *buf, int count, MPI_Datatype datatype, int src, int tag, MPI_Comm comm, MPI_Status * status)
{
  int retval = 0;

  smpi_bench_end();
  if (comm == MPI_COMM_NULL) {
    retval = MPI_ERR_COMM;
  } else if (src == MPI_PROC_NULL) {
    if(status != MPI_STATUS_IGNORE){
      simgrid::smpi::Status::empty(status);
      status->MPI_SOURCE = MPI_PROC_NULL;
    }
    retval = MPI_SUCCESS;
  } else if (src!=MPI_ANY_SOURCE && (src >= comm->group()->size() || src <0)){
    retval = MPI_ERR_RANK;
  } else if ((count < 0) || (buf==nullptr && count > 0)) {
    retval = MPI_ERR_COUNT;
  } else if (datatype==MPI_DATATYPE_NULL || not datatype->is_valid()) {
    retval = MPI_ERR_TYPE;
  } else if(tag<0 && tag !=  MPI_ANY_TAG){
    retval = MPI_ERR_TAG;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData("recv", src,
                                                       datatype->is_replayable() ? count : count * datatype->size(),
                                                       tag, simgrid::smpi::Datatype::encode(datatype)));

    simgrid::smpi::Request::recv(buf, count, datatype, src, tag, comm, status);
    retval = MPI_SUCCESS;

    // the src may not have been known at the beginning of the recv (MPI_ANY_SOURCE)
    int src_traced=0;
    if (status != MPI_STATUS_IGNORE) 
      src_traced = getPid(comm, status->MPI_SOURCE);
    else
      src_traced = getPid(comm, src);
    if (not TRACE_smpi_view_internals()) {
      TRACE_smpi_recv(src_traced, my_proc_id, tag);
    }
    
    TRACE_smpi_comm_out(my_proc_id);
  }

  smpi_bench_begin();
  return retval;
}
Example #21
int PMPI_Get_accumulate(const void* origin_addr, int origin_count, MPI_Datatype origin_datatype, void* result_addr,
                        int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp,
                        int target_count, MPI_Datatype target_datatype, MPI_Op op, MPI_Win win) {
  int retval = 0;
  smpi_bench_end();
  if (win == MPI_WIN_NULL) {
    retval = MPI_ERR_WIN;
  } else if (target_rank == MPI_PROC_NULL) {
    retval = MPI_SUCCESS;
  } else if (target_rank <0){
    retval = MPI_ERR_RANK;
  } else if (win->dynamic()==0 && target_disp <0){
    //in case of dynamic window, target_disp can be mistakenly seen as negative, as it is an address
    retval = MPI_ERR_ARG;
  } else if ((origin_count < 0 || target_count < 0 || result_count <0) ||
             (origin_addr==nullptr && origin_count > 0 && op != MPI_NO_OP) ||
             (result_addr==nullptr && result_count > 0)){
    retval = MPI_ERR_COUNT;
  } else if (((target_datatype == MPI_DATATYPE_NULL) || (result_datatype == MPI_DATATYPE_NULL)) ||
            (((origin_datatype != MPI_DATATYPE_NULL) && (not origin_datatype->is_valid())) || (not target_datatype->is_valid()) || (not result_datatype->is_valid()))) {
    retval = MPI_ERR_TYPE;
  } else if (op == MPI_OP_NULL) {
    retval = MPI_ERR_OP;
  } else {
    int my_proc_id = simgrid::s4u::this_actor::get_pid();
    MPI_Group group;
    win->get_group(&group);
    TRACE_smpi_comm_in(my_proc_id, __func__,
                       new simgrid::instr::Pt2PtTIData(
                           "Get_accumulate", target_rank,
                           target_datatype->is_replayable() ? target_count : target_count * target_datatype->size(),
                           simgrid::smpi::Datatype::encode(target_datatype)));

    retval = win->get_accumulate( origin_addr, origin_count, origin_datatype, result_addr,
                                  result_count, result_datatype, target_rank, target_disp,
                                  target_count, target_datatype, op);

    TRACE_smpi_comm_out(my_proc_id);
  }
  smpi_bench_begin();
  return retval;
}
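As a usage illustration (a sketch under standard MPI-3 RMA semantics, not taken from the source above), MPI_Get_accumulate with MPI_SUM yields an atomic fetch-and-add on a window:

#include <mpi.h>
#include <cstdio>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  long counter = 0, one = 1, old_value = -1;
  MPI_Win win;
  // every rank exposes one long; rank 0 hosts the shared counter
  MPI_Win_create(&counter, sizeof(long), sizeof(long), MPI_INFO_NULL, MPI_COMM_WORLD, &win);
  MPI_Win_lock(MPI_LOCK_SHARED, 0, 0, win);
  // atomically read the old value and add one at rank 0, displacement 0
  MPI_Get_accumulate(&one, 1, MPI_LONG, &old_value, 1, MPI_LONG,
                     0 /* target_rank */, 0 /* target_disp */, 1, MPI_LONG, MPI_SUM, win);
  MPI_Win_unlock(0, win);
  std::printf("rank %d fetched %ld\n", rank, old_value);
  MPI_Win_free(&win);
  MPI_Finalize();
  return 0;
}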
Example #22
int Coll_reduce_scatter_mvapich2::reduce_scatter(const void *sendbuf, void *recvbuf, const int *recvcnts,
    MPI_Datatype datatype, MPI_Op op,
    MPI_Comm comm)
{
  int mpi_errno = MPI_SUCCESS;
  int i = 0, comm_size = comm->size(), total_count = 0, type_size = 0, nbytes = 0;
  int is_commutative = 0;
  int* disps          = new int[comm_size];

  if(mv2_red_scat_thresholds_table==NULL)
    init_mv2_reduce_scatter_tables_stampede();

  is_commutative=(op==MPI_OP_NULL || op->is_commutative());
  for (i = 0; i < comm_size; i++) {
      disps[i] = total_count;
      total_count += recvcnts[i];
  }

  type_size=datatype->size();
  nbytes = total_count * type_size;

  if (is_commutative) {
    int range           = 0;
    int range_threshold = 0;

    /* Search for the corresponding system size inside the tuning table */
    while ((range < (mv2_size_red_scat_tuning_table - 1)) &&
           (comm_size > mv2_red_scat_thresholds_table[range].numproc)) {
      range++;
    }
    /* Search for the corresponding inter-leader function */
    while ((range_threshold < (mv2_red_scat_thresholds_table[range].size_inter_table - 1)) &&
           (nbytes > mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max) &&
           (mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].max != -1)) {
      range_threshold++;
    }

    /* Set the inter-leader function pointer */
    MV2_Red_scat_function =
        mv2_red_scat_thresholds_table[range].inter_leader[range_threshold].MV2_pt_Red_scat_function;

    mpi_errno = MV2_Red_scat_function(sendbuf, recvbuf, recvcnts, datatype, op, comm);
  } else {
    /* check whether all blocks have the same size */
    int is_block_regular = 1;
    for (i = 0; i < (comm_size - 1); ++i) {
      if (recvcnts[i] != recvcnts[i + 1]) {
        is_block_regular = 0;
        break;
      }
    }
    int pof2 = 1;
    while (pof2 < comm_size)
      pof2 <<= 1;
    if (pof2 == comm_size && is_block_regular) {
      /* noncommutative, power-of-two size, and block regular */
      mpi_errno = MPIR_Reduce_scatter_non_comm_MV2(sendbuf, recvbuf, recvcnts, datatype, op, comm);
    } else {
      /* noncommutative fallback: MPICH recursive-doubling variant */
      mpi_errno = Coll_reduce_scatter_mpich_rdb::reduce_scatter(sendbuf, recvbuf, recvcnts, datatype, op, comm);
    }
  }
  delete[] disps;
  return mpi_errno;
}
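For reference, a self-contained sketch (plain MPI, not the SimGrid internals above) of the MPI_Reduce_scatter operation this selector implements: every rank contributes a full vector, and rank i keeps element i of the element-wise sum. Equal recvcnts make this the block-regular case tested above:

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  std::vector<int> sendbuf(size, rank); // this rank's contribution to every slot
  std::vector<int> recvcnts(size, 1);   // equal blocks: the block-regular case
  int recvbuf = 0;
  // MPI_SUM is commutative, so the table-driven path above would be taken
  MPI_Reduce_scatter(sendbuf.data(), &recvbuf, recvcnts.data(), MPI_INT, MPI_SUM, MPI_COMM_WORLD);
  std::printf("rank %d received %d\n", rank, recvbuf); // 0 + 1 + ... + (size-1)
  MPI_Finalize();
  return 0;
}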
Example #23
int Coll_scatter_mvapich2_two_level_direct::scatter(void *sendbuf,
                                      int sendcnt,
                                      MPI_Datatype sendtype,
                                      void *recvbuf,
                                      int recvcnt,
                                      MPI_Datatype recvtype,
                                      int root, MPI_Comm  comm)
{
    int comm_size, rank;
    int local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = -1;
    int mpi_errno = MPI_SUCCESS;
    int recvtype_size, sendtype_size, nbytes;
    void *tmp_buf = NULL;
    void *leader_scatter_buf = NULL;
    MPI_Status status;
    int leader_root, leader_of_root = -1;
    MPI_Comm shmem_comm, leader_comm;
    // if not already set (happens when the algorithm is invoked directly, bypassing the mvapich2 selector)
    if(MV2_Scatter_intra_function==NULL)
      MV2_Scatter_intra_function=Coll_scatter_mpich::scatter;

    if(comm->get_leaders_comm()==MPI_COMM_NULL){
      comm->init_smp();
    }
    comm_size = comm->size();
    rank = comm->rank();

    if (((rank == root) && (recvcnt == 0))
        || ((rank != root) && (sendcnt == 0))) {
        return MPI_SUCCESS;
    }

    /* extract the rank,size information for the intra-node
     * communicator */
    shmem_comm = comm->get_intra_comm();
    local_rank = shmem_comm->rank();
    local_size = shmem_comm->size();

    if (local_rank == 0) {
        /* Node leader. Extract the rank, size information for the leader
         * communicator */
        leader_comm = comm->get_leaders_comm();
        leader_comm_size = leader_comm->size();
        leader_comm_rank = leader_comm->rank();
    }

    if (local_size == comm_size) {
        /* purely intra-node scatter. Just use the direct algorithm and we are done */
        mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype,
                                            root, comm);

    } else {
        recvtype_size=recvtype->size();
        sendtype_size=sendtype->size();

        if (rank == root) {
            nbytes = sendcnt * sendtype_size;
        } else {
            nbytes = recvcnt * recvtype_size;
        }

        if (local_rank == 0) {
            /* Node leader, allocate tmp_buffer */
            tmp_buf = smpi_get_tmp_sendbuffer(nbytes * local_size);
        }

        leader_comm = comm->get_leaders_comm();
        int* leaders_map = comm->get_leaders_map();
        leader_of_root = comm->group()->rank(leaders_map[root]);
        leader_root = leader_comm->group()->rank(leaders_map[root]);
        /* leader_root is the rank of the leader of the root in leader_comm.
         * It is used as the root of the inter-leader scatter ops. */

        if ((local_rank == 0) && (root != rank)
            && (leader_of_root == rank)) {
            /* The root of the scatter is on this node but is not the node
             * leader: the leader receives the full buffer from the root */
            leader_scatter_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
            Request::recv(leader_scatter_buf, nbytes * comm_size, MPI_BYTE,
                             root, COLL_TAG_SCATTER, comm, &status);

        }

        if (rank == root && local_rank != 0) {
            /* The root of the scatter operation is not the node leader. Send
             * data to the node leader */
            Request::send(sendbuf, sendcnt * comm_size, sendtype,
                                     leader_of_root, COLL_TAG_SCATTER, comm
                                     );
        }

        if (leader_comm_size > 1 && local_rank == 0) {
            if (not comm->is_uniform()) {
                int* displs   = NULL;
                int* sendcnts = NULL;
                int* node_sizes;
                int i      = 0;
                node_sizes = comm->get_non_uniform_map();

                if (root != leader_of_root) {
                    if (leader_comm_rank == leader_root) {
                        displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                        sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                        sendcnts[0] = node_sizes[0] * nbytes;
                        displs[0]   = 0;

                        for (i = 1; i < leader_comm_size; i++) {
                            displs[i]   = displs[i - 1] + node_sizes[i - 1] * nbytes;
                            sendcnts[i] = node_sizes[i] * nbytes;
                        }
                    }
                    Colls::scatterv(leader_scatter_buf, sendcnts, displs, MPI_BYTE, tmp_buf, nbytes * local_size,
                                    MPI_BYTE, leader_root, leader_comm);
                } else {
                    if (leader_comm_rank == leader_root) {
                        displs      = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                        sendcnts    = static_cast<int*>(xbt_malloc(sizeof(int) * leader_comm_size));
                        sendcnts[0] = node_sizes[0] * sendcnt;
                        displs[0]   = 0;

                        for (i = 1; i < leader_comm_size; i++) {
                            displs[i]   = displs[i - 1] + node_sizes[i - 1] * sendcnt;
                            sendcnts[i] = node_sizes[i] * sendcnt;
                        }
                    }
                    Colls::scatterv(sendbuf, sendcnts, displs, sendtype, tmp_buf, nbytes * local_size, MPI_BYTE,
                                    leader_root, leader_comm);
                }
                if (leader_comm_rank == leader_root) {
                    xbt_free(displs);
                    xbt_free(sendcnts);
                }
            } else {
                if (leader_of_root != root) {
                    mpi_errno = MPIR_Scatter_MV2_Direct(leader_scatter_buf, nbytes * local_size, MPI_BYTE,
                                                        tmp_buf, nbytes * local_size, MPI_BYTE,
                                                        leader_root, leader_comm);
                } else {
                    mpi_errno = MPIR_Scatter_MV2_Direct(sendbuf, sendcnt * local_size, sendtype,
                                                        tmp_buf, nbytes * local_size, MPI_BYTE,
                                                        leader_root, leader_comm);
                }
            }
        }
        /* The leaders are now done with the inter-leader part. Scatter the data within the nodes */

        if (rank == root && recvbuf == MPI_IN_PLACE) {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                (void *)sendbuf, sendcnt, sendtype,
                                                0, shmem_comm);
        } else {
            mpi_errno = MV2_Scatter_intra_function(tmp_buf, nbytes, MPI_BYTE,
                                                recvbuf, recvcnt, recvtype,
                                                0, shmem_comm);
        }
    }

    /* free the temporary buffers allocated on the node leaders (nothing was
     * allocated in the purely intra-node case) */
    if (comm_size != local_size && local_rank == 0) {
        smpi_free_tmp_buffer(tmp_buf);
        if (leader_of_root == rank && root != rank) {
            smpi_free_tmp_buffer(leader_scatter_buf);
        }
    }
    return (mpi_errno);
}
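The communicator layout the algorithm above relies on can be reproduced in plain MPI (a conceptual sketch under MPI-3, not SimGrid's get_intra_comm()/get_leaders_comm() machinery): split the world into per-node communicators, then collect the node leaders into a second communicator. The scatter then proceeds in two levels, across leaders and within each node:

#include <mpi.h>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  // intra-node communicator, analogous to comm->get_intra_comm()
  MPI_Comm shmem_comm;
  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, rank, MPI_INFO_NULL, &shmem_comm);
  int local_rank = 0;
  MPI_Comm_rank(shmem_comm, &local_rank);

  // leader communicator: local rank 0 of each node, analogous to get_leaders_comm()
  MPI_Comm leader_comm;
  MPI_Comm_split(MPI_COMM_WORLD, local_rank == 0 ? 0 : MPI_UNDEFINED, rank, &leader_comm);

  // level 1 would scatter node-sized chunks over leader_comm;
  // level 2 would scatter each chunk over shmem_comm (buffers omitted here)

  if (leader_comm != MPI_COMM_NULL)
    MPI_Comm_free(&leader_comm);
  MPI_Comm_free(&shmem_comm);
  MPI_Finalize();
  return 0;
}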
Example #24
int Coll_scatter_mvapich2::scatter(const void *sendbuf,
    int sendcnt,
    MPI_Datatype sendtype,
    void *recvbuf,
    int recvcnt,
    MPI_Datatype recvtype,
    int root, MPI_Comm comm)
{
  int range = 0, range_threshold = 0, range_threshold_intra = 0;
  int mpi_errno = MPI_SUCCESS;
  int rank, nbytes, comm_size;
  int partial_sub_ok = 0;
  int conf_index = 0;
  MPI_Comm shmem_comm;
  if (mv2_scatter_thresholds_table == NULL)
    init_mv2_scatter_tables_stampede();

  if(comm->get_leaders_comm()==MPI_COMM_NULL){
    comm->init_smp();
  }

  comm_size = comm->size();

  rank = comm->rank();

  if (rank == root) {
    int sendtype_size = sendtype->size();
    nbytes            = sendcnt * sendtype_size;
  } else {
    int recvtype_size = recvtype->size();
    nbytes            = recvcnt * recvtype_size;
  }

  // check if it is safe to use partial subscription mode
  if (comm->is_uniform()) {
    shmem_comm = comm->get_intra_comm();
    if (mv2_scatter_table_ppn_conf[0] == -1) {
      // user-defined tuning table
      conf_index = 0;
    } else {
      int local_size = shmem_comm->size();
      int i          = 0;
      do {
        if (local_size == mv2_scatter_table_ppn_conf[i]) {
          conf_index     = i;
          partial_sub_ok = 1;
          break;
        }
        i++;
      } while (i < mv2_scatter_num_ppn_conf);
    }
  }

  if (partial_sub_ok != 1) {
    conf_index = 0;
  }

  /* Search for the corresponding system size inside the tuning table */
  while ((range < (mv2_size_scatter_tuning_table[conf_index] - 1)) &&
         (comm_size > mv2_scatter_thresholds_table[conf_index][range].numproc)) {
    range++;
  }
  /* Search for the corresponding inter-leader function */
  while ((range_threshold < (mv2_scatter_thresholds_table[conf_index][range].size_inter_table - 1)) &&
         (nbytes > mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max) &&
         (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold].max != -1)) {
    range_threshold++;
  }

  /* Search for the corresponding intra-node function */
  while ((range_threshold_intra < (mv2_scatter_thresholds_table[conf_index][range].size_intra_table - 1)) &&
         (nbytes > mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max) &&
         (mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].max != -1)) {
    range_threshold_intra++;
  }

  MV2_Scatter_function = mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold]
                                                                                      .MV2_pt_Scatter_function;

  if (MV2_Scatter_function == &MPIR_Scatter_mcst_wrap_MV2) {
#if defined(_MCST_SUPPORT_)
    if (comm->ch.is_mcast_ok == 1 && mv2_use_mcast_scatter == 1 && comm->ch.shmem_coll_ok == 1) {
      MV2_Scatter_function = &MPIR_Scatter_mcst_MV2;
    } else
#endif /* defined(_MCST_SUPPORT_) */
    {
      if (mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].MV2_pt_Scatter_function !=
          NULL) {
        MV2_Scatter_function =
            mv2_scatter_thresholds_table[conf_index][range].inter_leader[range_threshold + 1].MV2_pt_Scatter_function;
      } else {
        /* fall back to the binomial algorithm */
        MV2_Scatter_function = &MPIR_Scatter_MV2_Binomial;
      }
    }
  }

  if ((MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Direct) ||
      (MV2_Scatter_function == &MPIR_Scatter_MV2_two_level_Binomial)) {
    if (comm->is_blocked()) {
      MV2_Scatter_intra_function =
          mv2_scatter_thresholds_table[conf_index][range].intra_node[range_threshold_intra].MV2_pt_Scatter_function;

      mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                       recvbuf, recvcnt, recvtype, root, comm);
    } else {
      mpi_errno = MPIR_Scatter_MV2_Binomial(sendbuf, sendcnt, sendtype,
                                            recvbuf, recvcnt, recvtype, root, comm);
    }
  } else {
    mpi_errno = MV2_Scatter_function(sendbuf, sendcnt, sendtype,
                                     recvbuf, recvcnt, recvtype, root, comm);
  }
  return (mpi_errno);
}
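Finally, a caller-side sketch (illustrative, not from the source above) of the MPI_Scatter call behind this selector; the per-rank message size and the communicator shape decide which MV2 variant the tuning tables pick:

#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank = 0, size = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);
  std::vector<int> sendbuf;
  if (rank == 0)                 // only the root's send buffer is significant
    for (int i = 0; i < size; i++)
      sendbuf.push_back(i * 10);
  int recv = -1;
  // small per-rank payloads like this tend to fall in the binomial or
  // two-level ranges of the tuning tables above
  MPI_Scatter(sendbuf.data(), 1, MPI_INT, &recv, 1, MPI_INT, 0, MPI_COMM_WORLD);
  std::printf("rank %d received %d\n", rank, recv);
  MPI_Finalize();
  return 0;
}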