int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    
    is_commutative = smpi_op_is_commute(op);

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
    
            /* noncommutative op and (non-power-of-two size or irregular
               block sizes): use recursive doubling. */

            /* need to allocate temporary buffer to receive incoming data*/
            tmp_recvbuf= (void *) xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

            /* need to allocate another temporary buffer to accumulate
               results */
            tmp_results = (void *)xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_results = (void *)((char*)tmp_results - true_lb);

            /* copy sendbuf into tmp_results */
            if (sendbuf != MPI_IN_PLACE)
                mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);
            else
                mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);

            if (mpi_errno) return(mpi_errno);

            mask = 0x1;
            i = 0;
            while (mask < comm_size) {
                dst = rank ^ mask;

                dst_tree_root = dst >> i;
                dst_tree_root <<= i;

                my_tree_root = rank >> i;
                my_tree_root <<= i;

                /* At step 1, processes exchange (n-n/p) amount of
                   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
                   amount of data, and so forth. We use derived datatypes for this.

                   At each step, a process does not need to send data
                   indexed from my_tree_root to
                   my_tree_root+mask-1. Similarly, a process won't receive
                   data indexed from dst_tree_root to dst_tree_root+mask-1. */
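                /* Worked example (illustrative): with comm_size = 4 and
                   equal recvcounts[j] = c, at step i = 0 (mask = 1) rank 2
                   has my_tree_root = 2, so blklens[0] = 2c (blocks 0..1),
                   blklens[1] = c (block 3), dis[0] = 0 and dis[1] = 3c:
                   everything except its own block 2 is sent, i.e.
                   total_count - total_count/comm_size elements, matching
                   the comment above. */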

                /* calculate sendtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<my_tree_root; j++)
                    blklens[0] += recvcounts[j];
                for (j=my_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&sendtype);

                /* calculate recvtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<dst_tree_root && j<comm_size; j++)
                    blklens[0] += recvcounts[j];
                for (j=dst_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&recvtype);

                received = 0;
                if (dst < comm_size) {
                    /* tmp_results contains data to be sent in each step. Data is
                       received in tmp_recvbuf and then accumulated into
                       tmp_results. accumulation is done later below.   */ 

                    smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                                                 COLL_TAG_SCATTER,
                                                 tmp_recvbuf, 1, recvtype, dst,
                                                 COLL_TAG_SCATTER, comm,
                                                 MPI_STATUS_IGNORE);
                    received = 1;
                }

                /* if some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of non-power-of-two, we need to send them the
                   result. We use a logarithmic recursive-halfing algorithm
                   for this. */
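                /* Illustrative case: with comm_size = 6 at the step where
                   mask = 4, ranks 2 and 3 compute dst = 6 and 7, which do
                   not exist, so they receive nothing above. Ranks 0..3
                   share my_tree_root = 0 and dst_tree_root = 4, hence
                   dst_tree_root + mask = 8 > 6 and nprocs_completed =
                   6 - 0 - 4 = 2: ranks 0 and 1 hold the data and forward
                   it below (0 -> 2 and 1 -> 3 at tmp_mask = 2). */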

                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data. Send data to others
                       in a tree fashion. First find root of current tree
                       that is being divided into two. k is the number of
                       least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root */ 
                    j = mask;
                    k = 0;
                    while (j) {
                        j >>= 1;
                        k++;
                    }
                    k--;

                    tmp_mask = mask >> 1;
                    while (tmp_mask) {
                        dst = rank ^ tmp_mask;

                        tree_root = rank >> k;
                        tree_root <<= k;

                        /* send only if this proc has data and destination
                           doesn't have data. at any step, multiple processes
                           can send if they have the data */
                        if ((dst > rank) && 
                            (rank < tree_root + nprocs_completed)
                            && (dst >= tree_root + nprocs_completed)) {
                            /* send the current result */
                            smpi_mpi_send(tmp_recvbuf, 1, recvtype,
                                                     dst, COLL_TAG_SCATTER,
                                                     comm);
                        }
                        /* recv only if this proc. doesn't have data and sender
                           has data */
                        else if ((dst < rank) && 
                                 (dst < tree_root + nprocs_completed) &&
                                 (rank >= tree_root + nprocs_completed)) {
                            smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                                     COLL_TAG_SCATTER,
                                                     comm, MPI_STATUS_IGNORE); 
                            received = 1;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }

                /* The following reduction is done here instead of after
                   the smpi_mpi_sendrecv or smpi_mpi_recv above. Doing it
                   above would, in the noncommutative case, require an
                   extra temporary buffer so as not to overwrite
                   tmp_recvbuf, because tmp_recvbuf may still have to be
                   forwarded to other processes in the non-power-of-two
                   case. To avoid that extra allocation, we do the reduce
                   here. */
                if (received) {
                    if (is_commutative || (dst_tree_root < my_tree_root)) {
                        smpi_op_apply(op, tmp_recvbuf, tmp_results,
                                      &blklens[0], &datatype);
                        smpi_op_apply(op,
                                      ((char *)tmp_recvbuf + dis[1]*extent),
                                      ((char *)tmp_results + dis[1]*extent),
                                      &blklens[1], &datatype);
                    }
                    else {
                        smpi_op_apply(op, tmp_results, tmp_recvbuf,
                                      &blklens[0], &datatype);
                        smpi_op_apply(op,
                                      ((char *)tmp_results + dis[1]*extent),
                                      ((char *)tmp_recvbuf + dis[1]*extent),
                                      &blklens[1], &datatype);
                        /* copy the combined result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
                                                       tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                /* free the derived datatypes created for this step */
                smpi_datatype_free(&sendtype);
                smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }

            /* now copy this process's portion of the final result from
               tmp_results into recvbuf */
            mpi_errno = smpi_datatype_copy(((char *)tmp_results + disps[rank]*extent),
                                           recvcounts[rank], datatype,
                                           recvbuf, recvcounts[rank], datatype);
            if (mpi_errno) return(mpi_errno);

            xbt_free(disps);
            /* undo the true_lb adjustment before freeing the temporaries */
            xbt_free((char*)tmp_recvbuf + true_lb);
            xbt_free((char*)tmp_results + true_lb);
            return MPI_SUCCESS;
}
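
/* A minimal standalone sketch (not SMPI code; compile it on its own) of the
   pairing scheme used above: at step i (mask = 2^i) each rank exchanges with
   rank ^ mask, and the subtree roots are obtained by zeroing the low i bits
   of the rank. Running it for a hypothetical 6-process communicator shows
   which ranks are left without a partner at the non-power-of-two steps. */
#include <stdio.h>

static void print_rdb_schedule(int comm_size)
{
    int mask, i = 0;
    for (mask = 0x1; mask < comm_size; mask <<= 1, i++) {
        int rank;
        printf("step %d (mask %d):\n", i, mask);
        for (rank = 0; rank < comm_size; rank++) {
            int dst = rank ^ mask;
            int my_tree_root = (rank >> i) << i;
            int dst_tree_root = (dst >> i) << i;
            if (dst < comm_size)
                printf("  rank %d <-> rank %d (tree roots %d / %d)\n",
                       rank, dst, my_tree_root, dst_tree_root);
            else
                printf("  rank %d idle (dst %d does not exist)\n", rank, dst);
        }
    }
}

int main(void)
{
    print_rdb_schedule(6); /* e.g. at mask = 4, ranks 2 and 3 are idle */
    return 0;
}
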
int 
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                  MPI_Datatype sdtype,
                                                  void* rbuf, int *rcounts, int *rdispls,
                                                  MPI_Datatype rdtype,
                                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  
    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;


    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (size % 2) {
        XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
                  size);
        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                                     rbuf, rcounts, 
                                                     rdispls, rdtype,
                                                     comm);
    }

    XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = smpi_datatype_extent (sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to 
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype, 
                              tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
    } 

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - The first step is special: exchange a single block with neighbor[0].
       - In the remaining steps, update recv_data_from according to the
         offset and exchange two blocks with the appropriate neighbor;
         the send location becomes the previous receive location.
       Note that we need to create indexed datatypes to send and receive
       these blocks properly.
    */
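    /* Illustrative schedule (for instance, size = 6, rank 0): step 0 swaps
       block 0 for block 1 with rank 1; step 1 sends blocks {0,1} to rank 5
       and receives {4,5}; step 2 sends {4,5} to rank 1 and receives {2,3}.
       After size/2 = 3 steps every rank holds all six blocks. */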
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   tmprecv, rcounts[neighbor[0]], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   comm, MPI_STATUS_IGNORE);

    /* Determine initial sending counts and displacements */
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] = 
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1)
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */
        
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, 
                                      &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, 
                                      &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);
      
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;
      
        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];
      
        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d",
              __FILE__, line, err, rank);
    return err;
}
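
/* A minimal standalone sketch (not SMPI code; compile it on its own) of the
   neighbor-exchange schedule computed above: for a hypothetical even size it
   prints which neighbor each rank talks to at every step and which pair of
   blocks it forwards. Block indices follow the same invariant as the code
   above: send_data_from is always even, so send_data_from + 1 never wraps. */
#include <stdio.h>

static void print_neighborexchange_schedule(int size, int rank)
{
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
    int i, even_rank = !(rank % 2);

    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = recv_data_from[1] = rank;
        offset_at_step[0] = +2;
        offset_at_step[1] = -2;
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = recv_data_from[1] = neighbor[0];
        offset_at_step[0] = -2;
        offset_at_step[1] = +2;
    }
    printf("rank %d, step 0: exchange own block with rank %d\n",
           rank, neighbor[0]);
    send_data_from = even_rank ? rank : recv_data_from[0];
    for (i = 1; i < size / 2; i++) {
        const int p = i % 2;
        recv_data_from[p] =
            (recv_data_from[p] + offset_at_step[p] + size) % size;
        printf("rank %d, step %d: send blocks {%d,%d} to / recv {%d,%d} "
               "from rank %d\n", rank, i,
               send_data_from, send_data_from + 1,
               recv_data_from[p], recv_data_from[p] + 1, neighbor[p]);
        send_data_from = recv_data_from[p];
    }
}

int main(void)
{
    int r;
    for (r = 0; r < 6; r++)       /* size must be even, as checked above */
        print_neighborexchange_schedule(6, r);
    return 0;
}
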
/*****************************************************************************
 * Function: alltoall_bruck
 * Return: int
 * Inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: datatype of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: datatype of elements being received
 *   comm: communicator
 * Description: implements the all-to-all operation using the Bruck algorithm.
 * Author: MPICH / modified by Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_alltoall_bruck(void *send_buff, int send_count,
                               MPI_Datatype send_type, void *recv_buff,
                               int recv_count, MPI_Datatype recv_type,
                               MPI_Comm comm)
{
    MPI_Status status;
    MPI_Aint extent;
    MPI_Datatype new_type;

    int *blocks_length, *disps;
    int i, src, dst, rank, num_procs, count, remainder, block, position;
    int pack_size, tag = COLL_TAG_ALLTOALL, pof2 = 1;


    char *tmp_buff;
    char *send_ptr = (char *) send_buff;
    char *recv_ptr = (char *) recv_buff;

    num_procs = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent = smpi_datatype_get_extent(recv_type);

    tmp_buff = (char *) smpi_get_tmp_sendbuffer(num_procs * recv_count * extent);
    disps = (int *) xbt_malloc(sizeof(int) * num_procs);
    blocks_length = (int *) xbt_malloc(sizeof(int) * num_procs);

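    /* Phase 1: local rotation (the self-sendrecv acts as a datatype-aware
       local copy). Blocks rank..num_procs-1 of the send buffer go to the
       front of recv_buff and blocks 0..rank-1 behind them, so the block
       destined to rank (rank + k) % num_procs ends up at position k. */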
    smpi_mpi_sendrecv(send_ptr + rank * send_count * extent,
                      (num_procs - rank) * send_count, send_type, rank, tag,
                      recv_ptr, (num_procs - rank) * recv_count, recv_type, rank,
                      tag, comm, &status);

    smpi_mpi_sendrecv(send_ptr, rank * send_count, send_type, rank, tag,
                      recv_ptr + (num_procs - rank) * recv_count * extent,
                      rank * recv_count, recv_type, rank, tag, comm, &status);

    MPI_Pack_size(send_count * num_procs, send_type, comm, &pack_size);

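    /* Phase 2: ceil(log2(num_procs)) exchange rounds. In the round for a
       given pof2, every block whose index has the pof2 bit set is packed,
       shipped to rank (rank + pof2) % num_procs, and replaced by the
       matching blocks received from rank (rank - pof2 + num_procs) %
       num_procs; block k therefore travels k positions in total. */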
    while (pof2 < num_procs) {
        dst = (rank + pof2) % num_procs;
        src = (rank - pof2 + num_procs) % num_procs;


        count = 0;
        for (block = 1; block < num_procs; block++)
            if (block & pof2) {
                blocks_length[count] = send_count;
                disps[count] = block * send_count;
                count++;
            }
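        /* Illustrative selection (assuming num_procs = 8): the round with
           pof2 = 1 packs blocks 1,3,5,7; pof2 = 2 packs 2,3,6,7; and
           pof2 = 4 packs 4,5,6,7. */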

        MPI_Type_indexed(count, blocks_length, disps, recv_type, &new_type);
        smpi_datatype_commit(&new_type);

        position = 0;
        MPI_Pack(recv_buff, 1, new_type, tmp_buff, pack_size, &position, comm);

        smpi_mpi_sendrecv(tmp_buff, position, MPI_PACKED, dst, tag, recv_buff, 1,
                          new_type, src, tag, comm, &status);
        smpi_datatype_unuse(new_type);

        pof2 *= 2;
    }

    xbt_free(disps);
    xbt_free(blocks_length);

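    /* Phase 3: undo the rotation. The two self-sendrecvs rotate the blocks
       so that the one at index rank + 1 comes first, and the loop below
       copies them back in reverse order; afterwards slot i of recv_buff
       holds the data that originated at rank i. */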
    smpi_mpi_sendrecv(recv_ptr + (rank + 1) * recv_count * extent,
                      (num_procs - rank - 1) * recv_count, send_type,
                      rank, tag, tmp_buff, (num_procs - rank - 1) * recv_count,
                      recv_type, rank, tag, comm, &status);

    smpi_mpi_sendrecv(recv_ptr, (rank + 1) * recv_count, send_type, rank, tag,
                      tmp_buff + (num_procs - rank - 1) * recv_count * extent,
                      (rank + 1) * recv_count, recv_type, rank, tag, comm, &status);


    for (i = 0; i < num_procs; i++)
        smpi_mpi_sendrecv(tmp_buff + i * recv_count * extent, recv_count, send_type,
                          rank, tag,
                          recv_ptr + (num_procs - i - 1) * recv_count * extent,
                          recv_count, recv_type, rank, tag, comm, &status);

    smpi_free_tmp_buffer(tmp_buff);
    return MPI_SUCCESS;
}
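
/* A minimal standalone self-test (not SMPI code; compile it on its own) that
   simulates the three Bruck phases on block labels only: buf[r][k] holds the
   label of the block at slot k of rank r, encoded as source * P + dest.
   After the local rotation, the ceil(log2(P)) exchange rounds, and the final
   rotation-plus-reversal, slot s of rank r must hold the block sent by
   rank s to rank r. */
#include <stdio.h>
#include <string.h>

#define P 6 /* hypothetical communicator size */

int main(void)
{
    int buf[P][P], tmp[P][P];
    int r, k, pof2;

    /* buf[r][d] starts as the block rank r wants to send to rank d,
       encoded as r * P + d */
    for (r = 0; r < P; r++)
        for (k = 0; k < P; k++)
            buf[r][k] = r * P + k;

    /* phase 1: local rotation, so the block for rank (r+k)%P sits at k */
    for (r = 0; r < P; r++) {
        for (k = 0; k < P; k++)
            tmp[r][k] = buf[r][(r + k) % P];
        memcpy(buf[r], tmp[r], sizeof buf[r]);
    }

    /* phase 2: in the round for pof2, slots whose index has that bit set
       move pof2 ranks forward (i.e. rank r receives them from r - pof2) */
    for (pof2 = 1; pof2 < P; pof2 *= 2) {
        memcpy(tmp, buf, sizeof buf);
        for (r = 0; r < P; r++) {
            int src = (r - pof2 + P) % P;
            for (k = 0; k < P; k++)
                if (k & pof2)
                    buf[r][k] = tmp[src][k];
        }
    }

    /* phase 3: inverse rotation plus reversal: slot k moves to (r-k)%P */
    for (r = 0; r < P; r++) {
        for (k = 0; k < P; k++)
            tmp[r][(r - k + P) % P] = buf[r][k];
        memcpy(buf[r], tmp[r], sizeof buf[r]);
    }

    /* slot s of rank r must now hold the block rank s sent to rank r */
    for (r = 0; r < P; r++)
        for (k = 0; k < P; k++)
            if (buf[r][k] != k * P + r)
                printf("mismatch at rank %d slot %d\n", r, k);
    printf("bruck schedule check done\n");
    return 0;
}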