Example #1
int smpi_coll_tuned_allgather_ompi(void *sbuf, int scount,
                                   MPI_Datatype sdtype,
                                   void *rbuf, int rcount,
                                   MPI_Datatype rdtype,
                                   MPI_Comm comm)
{
    int communicator_size, pow2_size;
    size_t dsize, total_dsize;

    communicator_size = smpi_comm_size(comm);

    /* Special case for 2 processes */
    if (communicator_size == 2) {
        return smpi_coll_tuned_allgather_pair(sbuf, scount, sdtype,
                                              rbuf, rcount, rdtype,
                                              comm/*, module*/);
    }

    /* Determine complete data size */
    dsize = smpi_datatype_size(sdtype);
    total_dsize = dsize * scount * communicator_size;
   
    /* Round the communicator size up to the next power of two */
    for (pow2_size = 1; pow2_size < communicator_size; pow2_size <<= 1) /* empty */;

#if !defined(USE_MPICH2_DECISION)
    /* Decision based on MX 2Gb results from Grig cluster at
       The University of Tennessee, Knoxville
       - if total message size is less than 50KB use either bruck or 
       recursive doubling for non-power of two and power of two nodes, 
       respectively.
       - else use ring and neighbor exchange algorithms for odd and even 
       number of nodes, respectively.
    */
    if (total_dsize < 50000) {
        if (pow2_size == communicator_size) {
            return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                                 rbuf, rcount, rdtype,
                                                 comm);
        } else {
            return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                                   rbuf, rcount, rdtype,
                                                   comm);
        }
    } else {
        if (communicator_size % 2) {
            return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
        } else {
            return smpi_coll_tuned_allgather_ompi_neighborexchange(sbuf, scount, sdtype,
                                                                   rbuf, rcount, rdtype,
                                                                   comm);
        }
    }
   
#else   /* USE_MPICH2_DECISION */
    /* Decision as in MPICH-2,
       presented in Thakur et al., "Optimization of Collective Communication
       Operations in MPICH", International Journal of High Performance Computing
       Applications, Vol. 19, No. 1, 49-66 (2005):
       - for power-of-two processes and small and medium size messages
       (up to 512KB) use recursive doubling,
       - for non-power-of-two processes and small messages (up to 80KB) use bruck,
       - for everything else use ring.
    */
    if ((pow2_size == communicator_size) && (total_dsize < 524288)) {
        return smpi_coll_tuned_allgather_rdb(sbuf, scount, sdtype,
                                             rbuf, rcount, rdtype,
                                             comm);
    } else if (total_dsize <= 81920) {
        return smpi_coll_tuned_allgather_bruck(sbuf, scount, sdtype,
                                               rbuf, rcount, rdtype,
                                               comm);
    }
    return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                          rbuf, rcount, rdtype,
                                          comm);
#endif  /* USE_MPICH2_DECISION */
}
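
For reference, the default (non-MPICH2) selection above reduces to a small decision table. The sketch below is a minimal, stand-alone illustration; choose_allgather, alg_t and alg_name are hypothetical names, and only the 2-process special case, the 50 KB threshold, the power-of-two test and the parity test come from the function above.

#include <stdio.h>

typedef enum { ALG_PAIR, ALG_RDB, ALG_BRUCK, ALG_RING, ALG_NEIGHBOR } alg_t;

static const char *alg_name[] = { "pair", "rdb", "bruck", "ring", "neighborexchange" };

/* Hypothetical helper mirroring the default decision logic above. */
static alg_t choose_allgather(int comm_size, size_t total_dsize)
{
    int pow2;
    if (comm_size == 2)
        return ALG_PAIR;                                /* special case */
    for (pow2 = 1; pow2 < comm_size; pow2 <<= 1) /* empty */;
    if (total_dsize < 50000)                            /* "small" messages */
        return (pow2 == comm_size) ? ALG_RDB : ALG_BRUCK;
    return (comm_size % 2) ? ALG_RING : ALG_NEIGHBOR;   /* "large" messages */
}

int main(void)
{
    printf("16 ranks, 1 KB total -> %s\n", alg_name[choose_allgather(16, 1024)]);
    printf("24 ranks, 1 KB total -> %s\n", alg_name[choose_allgather(24, 1024)]);
    printf("24 ranks, 1 MB total -> %s\n", alg_name[choose_allgather(24, 1 << 20)]);
    printf("25 ranks, 1 MB total -> %s\n", alg_name[choose_allgather(25, 1 << 20)]);
    return 0;
}
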
int smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                    MPI_Datatype sdtype,
                                                    void *rbuf, int rcount,
                                                    MPI_Datatype rdtype,
                                                    MPI_Comm comm)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG("coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                            rbuf, rcount, rdtype,
                                            comm);
   }

   XBT_DEBUG("coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent(sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent(rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy(tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   }

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }
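
   /* Example (illustrative): with size = 6, even rank 2 exchanges first with
      neighbor[0] = 3 and then alternates between neighbor[1] = 1 and
      neighbor[0] = 3; odd rank 3 exchanges first with neighbor[0] = 2,
      initially receiving block 2 from it. */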

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps:
        update recv_data_from according to offset, and
        exchange two blocks with the appropriate neighbor.
        The send location becomes the previous receive location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     tmprecv, rcount, rdtype, neighbor[0],
                     COLL_TAG_ALLGATHER,
                     comm, MPI_STATUS_IGNORE);
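
   /* After this first exchange every rank holds two consecutive blocks:
      even rank r holds blocks r and r+1, odd rank r holds blocks r-1 and r. */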

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] = 
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;
      
      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        tmprecv, 2 * rcount, rdtype,
                        neighbor[i_parity],
                        COLL_TAG_ALLGATHER,
                        comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}
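
To make the block movement concrete, the stand-alone sketch below (hypothetical, not part of SimGrid) reuses only the neighbor/recv_data_from/offset_at_step arithmetic from the function above and simulates which blocks each rank holds after every step. For any even N it should end with every rank holding all N blocks; the matching sends are the mirror image of the receives and are not tracked separately here.

#include <stdio.h>
#include <string.h>

#define N 6  /* number of ranks; must be even, as the algorithm requires */

int main(void)
{
    int have[N][N];                    /* have[r][b] != 0 iff rank r holds block b */
    int neighbor[N][2], recv_from[N][2], offset[N][2];
    int r, i, b;

    memset(have, 0, sizeof have);
    for (r = 0; r < N; r++) {
        have[r][r] = 1;                /* initialization: own block in place */
        if (r % 2 == 0) {              /* even rank */
            neighbor[r][0] = (r + 1) % N;
            neighbor[r][1] = (r - 1 + N) % N;
            recv_from[r][0] = recv_from[r][1] = r;
            offset[r][0] = +2;
            offset[r][1] = -2;
        } else {                       /* odd rank */
            neighbor[r][0] = (r - 1 + N) % N;
            neighbor[r][1] = (r + 1) % N;
            recv_from[r][0] = recv_from[r][1] = neighbor[r][0];
            offset[r][0] = -2;
            offset[r][1] = +2;
        }
    }

    /* Step 0: each rank receives its first neighbor's own block. */
    for (r = 0; r < N; r++)
        have[r][neighbor[r][0]] = 1;

    /* Steps 1 .. N/2-1: each rank receives a pair of consecutive blocks. */
    for (i = 1; i < N / 2; i++) {
        int p = i % 2;
        for (r = 0; r < N; r++) {
            recv_from[r][p] = (recv_from[r][p] + offset[r][p] + N) % N;
            have[r][recv_from[r][p]] = 1;
            have[r][(recv_from[r][p] + 1) % N] = 1;
        }
    }

    for (r = 0; r < N; r++) {
        printf("rank %d holds:", r);
        for (b = 0; b < N; b++)
            if (have[r][b])
                printf(" %d", b);
        printf("\n");
    }
    return 0;
}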