Exemple #1
0
/*
 *	scatter_intra
 *
 *	Function:	- basic scatter operation
 *	Accepts:	- same arguments as MPI_Scatter()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
smpi_coll_tuned_scatter_ompi_basic_linear(void *sbuf, int scount,
					   MPI_Datatype sdtype,
					   void *rbuf, int rcount,
					   MPI_Datatype rdtype,
					   int root,
					   MPI_Comm comm
					   )
{
    int i, rank, size, err;
    char *ptmp;
    ptrdiff_t lb, incr;

    /* Initialize */

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    /* If not root, receive data. */

    if (rank != root) {
        smpi_mpi_recv(rbuf, rcount, rdtype, root,
                                COLL_TAG_SCATTER,
                                comm, MPI_STATUS_IGNORE);
        return MPI_SUCCESS;
    }

    /* I am the root, loop sending data. */

    err = smpi_datatype_extent(sdtype, &lb, &incr);
    if (MPI_SUCCESS != err) {
        return MPI_ERR_OTHER;
    }

    incr *= scount;
    for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {

        /* simple optimization */

        if (i == rank) {
            if (MPI_IN_PLACE != rbuf) {
                err =
                    smpi_datatype_copy(ptmp, scount, sdtype, rbuf, rcount,
                                    rdtype);
            }
        } else {
            smpi_mpi_send(ptmp, scount, sdtype, i,
                                    COLL_TAG_SCATTER,
                                     comm);
        }
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* All done */

    return MPI_SUCCESS;
}
Exemple #2
0
/**
 * Alltoall basic_linear (STARMPI:alltoall-simple)
 **/
int smpi_coll_tuned_alltoall_basic_linear(void *sendbuf, int sendcount,
                                          MPI_Datatype sendtype,
                                          void *recvbuf, int recvcount,
                                          MPI_Datatype recvtype,
                                          MPI_Comm comm)
{
  int system_tag = 888;
  int i, rank, size, err, count;
  MPI_Aint lb = 0, sendext = 0, recvext = 0;
  MPI_Request *requests;

  /* Initialize. */
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_basic_linear() called.", rank);
  smpi_datatype_extent(sendtype, &lb, &sendext);
  smpi_datatype_extent(recvtype, &lb, &recvext);
  /* simple optimization */
  err = smpi_datatype_copy((char *)sendbuf + rank * sendcount * sendext, 
                           sendcount, sendtype, 
                           (char *)recvbuf + rank * recvcount * recvext, 
                           recvcount, recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    /* Post all receives first -- a simple optimization */
    count = 0;
    for (i = (rank + 1) % size; i != rank; i = (i + 1) % size) {
      requests[count] =
          smpi_irecv_init((char *)recvbuf + i * recvcount * recvext, recvcount, 
                          recvtype, i, system_tag, comm);
      count++;
    }
    /* Now post all sends in reverse order
     *   - We would like to minimize the search time through message queue
     *     when messages actually arrive in the order in which they were posted.
     * TODO: check the previous assertion
     */
    for (i = (rank + size - 1) % size; i != rank; i = (i + size - 1) % size) {
      requests[count] =
          smpi_isend_init((char *)sendbuf + i * sendcount * sendext, sendcount,
                          sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
    for(i = 0; i < count; i++) {
      if(requests[i]!=MPI_REQUEST_NULL) smpi_mpi_request_free(&requests[i]);
    }
    xbt_free(requests);
  }
  return err;
}
Exemple #3
0
int
smpi_coll_tuned_reduce_ompi_basic_linear(void *sbuf, void *rbuf, int count,
                                          MPI_Datatype dtype,
                                          MPI_Op op,
                                          int root,
                                          MPI_Comm comm)
{
    int i, rank, size;
    ptrdiff_t true_extent, lb, extent;
    char *free_buffer = NULL;
    char *pml_buffer = NULL;
    char *inplace_temp = NULL;
    char *inbuf;

    /* Initialize */

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);

    XBT_DEBUG("coll:tuned:reduce_intra_basic_linear rank %d", rank);

    /* If not root, send data to the root. */

    if (rank != root) {
        smpi_mpi_send(sbuf, count, dtype, root,
                                COLL_TAG_REDUCE,
                                comm);
        return -1;
    }

    /* see discussion in ompi_coll_basic_reduce_lin_intra about 
       extent and true extent */
    /* for reducing buffer allocation lengths.... */

    smpi_datatype_extent(dtype, &lb, &extent);
    true_extent = smpi_datatype_get_extent(dtype);

    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
        inplace_temp = (char*)malloc(true_extent + (count - 1) * extent);
        if (NULL == inplace_temp) {
            return -1;
        }
        rbuf = inplace_temp - lb;
    }

    if (size > 1) {
        free_buffer = (char*)malloc(true_extent + (count - 1) * extent);
        pml_buffer = free_buffer - lb;
    }

    /* Initialize the receive buffer. */

    if (rank == (size - 1)) {
        smpi_datatype_copy((char*)sbuf, count, dtype,(char*)rbuf, count, dtype);
    } else {
        smpi_mpi_recv(rbuf, count, dtype, size - 1,
                                COLL_TAG_REDUCE, comm,
                                MPI_STATUS_IGNORE);
    }

    /* Loop receiving and calling reduction function (C or Fortran). */

    for (i = size - 2; i >= 0; --i) {
        if (rank == i) {
            inbuf = (char*)sbuf;
        } else {
            smpi_mpi_recv(pml_buffer, count, dtype, i,
                                    COLL_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE);
            inbuf = pml_buffer;
        }

        /* Perform the reduction */
        smpi_op_apply(op, inbuf, rbuf, &count, &dtype);
    }

    if (NULL != inplace_temp) {
        smpi_datatype_copy(inplace_temp, count, dtype,(char*)sbuf
                                                  ,count , dtype);
        free(inplace_temp);
    }
    if (NULL != free_buffer) {
        free(free_buffer);
    }

    /* All done */
    return MPI_SUCCESS;
}
Exemple #4
0
int smpi_coll_tuned_allgatherv_mpich_rdb (
  void *sendbuf,
  int sendcount,
  MPI_Datatype sendtype,
  void *recvbuf,
  int *recvcounts,
  int *displs,
  MPI_Datatype recvtype,
  MPI_Comm comm)
{
  int        comm_size, rank, j, i;
  MPI_Status status;
  MPI_Aint  recvtype_extent, recvtype_true_extent, recvtype_true_lb;
  int curr_cnt, dst, total_count;
  void *tmp_buf, *tmp_buf_rl;
  int mask, dst_tree_root, my_tree_root, position,
    send_offset, recv_offset, last_recv_cnt=0, nprocs_completed, k,
    offset, tmp_mask, tree_root;

  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  total_count = 0;
  for (i=0; i<comm_size; i++)
    total_count += recvcounts[i];

  if (total_count == 0) return MPI_ERR_COUNT;

  recvtype_extent=smpi_datatype_get_extent( recvtype);

  /* need to receive contiguously into tmp_buf because
     displs could make the recvbuf noncontiguous */

  smpi_datatype_extent(recvtype, &recvtype_true_lb, &recvtype_true_extent);

  tmp_buf_rl= (void*)smpi_get_tmp_sendbuffer(total_count*(MAX(recvtype_true_extent,recvtype_extent)));

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf_rl - recvtype_true_lb);

  /* copy local data into right location in tmp_buf */
  position = 0;
  for (i=0; i<rank; i++) position += recvcounts[i];
  if (sendbuf != MPI_IN_PLACE)
  {
    smpi_datatype_copy(sendbuf, sendcount, sendtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  else
  {
    /* if in_place specified, local data is found in recvbuf */
    smpi_datatype_copy(((char *)recvbuf +
                        displs[rank]*recvtype_extent),
                       recvcounts[rank], recvtype,
                       ((char *)tmp_buf + position*
                        recvtype_extent),
                       recvcounts[rank], recvtype);
  }
  curr_cnt = recvcounts[rank];

  mask = 0x1;
  i = 0;
  while (mask < comm_size) {
    dst = rank ^ mask;

    /* find offset into send and recv buffers. zero out
       the least significant "i" bits of rank and dst to
       find root of src and dst subtrees. Use ranks of
       roots as index to send from and recv into buffer */

    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;

    if (dst < comm_size) {
      send_offset = 0;
      for (j=0; j<my_tree_root; j++)
        send_offset += recvcounts[j];

      recv_offset = 0;
      for (j=0; j<dst_tree_root; j++)
        recv_offset += recvcounts[j];

      smpi_mpi_sendrecv(((char *)tmp_buf + send_offset * recvtype_extent),
                        curr_cnt, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        ((char *)tmp_buf + recv_offset * recvtype_extent),
                        total_count - recv_offset, recvtype, dst,
                        COLL_TAG_ALLGATHERV,
                        comm, &status);
      /* for convenience, recv is posted for a bigger amount
         than will be sent */
      last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
      curr_cnt += last_recv_cnt;
    }

    /* if some processes in this process's subtree in this step
       did not have any destination process to communicate with
       because of non-power-of-two, we need to send them the
       data that they would normally have received from those
       processes. That is, the haves in this subtree must send to
       the havenots. We use a logarithmic
       recursive-halfing algorithm for this. */

    /* This part of the code will not currently be
       executed because we are not using recursive
       doubling for non power of two. Mark it as experimental
       so that it doesn't show up as red in the coverage
       tests. */

    /* --BEGIN EXPERIMENTAL-- */
    if (dst_tree_root + mask > comm_size) {
      nprocs_completed = comm_size - my_tree_root - mask;
      /* nprocs_completed is the number of processes in this
         subtree that have all the data. Send data to others
         in a tree fashion. First find root of current tree
         that is being divided into two. k is the number of
         least-significant bits in this process's rank that
         must be zeroed out to find the rank of the root */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      tmp_mask = mask >> 1;

      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination
           doesn't have data. at any step, multiple processes
           can send if they have the data */
        if ((dst > rank) &&
            (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];
          offset *= recvtype_extent;

          smpi_mpi_send(((char *)tmp_buf + offset),
                        last_recv_cnt,
                        recvtype, dst,
                        COLL_TAG_ALLGATHERV, comm);
          /* last_recv_cnt was set in the previous
             receive. that's the amount of data to be
             sent now. */
        }
        /* recv only if this proc. doesn't have data and sender
           has data */
        else if ((dst < rank) &&
                 (dst < tree_root + nprocs_completed) &&
                 (rank >= tree_root + nprocs_completed)) {

          offset = 0;
          for (j=0; j<(my_tree_root+mask); j++)
            offset += recvcounts[j];

          smpi_mpi_recv(((char *)tmp_buf + offset * recvtype_extent),
                        total_count - offset, recvtype,
                        dst, COLL_TAG_ALLGATHERV,
                        comm, &status);
          /* for convenience, recv is posted for a
             bigger amount than will be sent */
          last_recv_cnt=smpi_mpi_get_count(&status, recvtype);
          curr_cnt += last_recv_cnt;
        }
        tmp_mask >>= 1;
        k--;
      }
    }
    /* --END EXPERIMENTAL-- */

    mask <<= 1;
    i++;
  }
Exemple #5
0
/**
 * This is a generic implementation of the reduce protocol. It used the tree
 * provided as an argument and execute all operations using a segment of
 * count times a datatype.
 * For the last communication it will update the count in order to limit
 * the number of datatype to the original count (original_count)
 *
 * Note that for non-commutative operations we cannot save memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate 
 * to keep the optimized loop happy.
 */
int smpi_coll_tuned_ompi_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    MPI_Datatype datatype, MPI_Op  op,
                                    int root, MPI_Comm comm,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request  reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    smpi_datatype_extent( datatype, &lower_bound, &extent);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    XBT_DEBUG( "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)(num_segments * segment_increment), (unsigned long)segment_increment, max_outstanding_reqs);

    rank = smpi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up 
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent=smpi_datatype_get_extent( datatype);

        /* handle non existant recv buffer (i.e. its NULL) and 
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent + 
                                          (original_count - 1) * extent);
            if (accumbuf_free == NULL) { 
                line = __LINE__; ret = -1; goto error_hndl; 
            }
            accumbuf = accumbuf_free - lower_bound;
        } 

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplfy the loops */
        if (!smpi_op_is_commute(op)) {
            smpi_datatype_copy(
                                                (char*)sendtmpbuf, original_count, datatype,
                                                (char*)accumbuf, original_count, datatype);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) { 
            line = __LINE__; ret = -1; goto error_hndl; 
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is chance to overlap communication -
           allocate second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) { 
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        } 

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) and 
                         * commutative operations we might be able to irecv 
                         * directly into the accumulate buffer so that we can 
                         * reduce(op) this with our sendbuf in one step as 
                         * ompi_op_reduce only has two buffer pointers, 
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or 
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }

                    reqs[inbi]=smpi_mpi_irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i], 
                                             COLL_TAG_REDUCE, comm
                                             );
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be 
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                smpi_mpi_waitall( 1, &reqs[inbi ^ 1], 
                                             MPI_STATUSES_IGNORE );
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data 
                     * with the data we recvd from down stream (but only 
                     * the operation is commutative and if we are not root and 
                     * not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (smpi_op_is_commute(op)) && 
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    smpi_op_apply(op, local_op_buffer, 
                                   accumbuf + segindex * segment_increment, 
                                   &recvcount, &datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (segindex-1) * segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (segindex-1) * segment_increment;
                        }
                    }
                    smpi_op_apply(op, local_op_buffer, accumulator, &prevcount, 
                                   &datatype );

                    /* all reduced on available data this step (i) complete, 
                     * pass to the next process unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        smpi_mpi_send( accumulator, prevcount, 
                                                  datatype, tree->tree_prev, 
                                                  COLL_TAG_REDUCE,
                                                  comm);
                    }

                    /* we stop when segindex = number of segments 
                       (i.e. we do num_segment+1 steps for pipelining */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes 
       Depending on the value of max_outstanding_reqs and 
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid overflooding the parent nodes by limiting the number of 
       outstanding requests to max_oustanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size 
       for the current communication, synchronization should be used only 
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than a maximum number of oustanding
           requests or there is no limit on the maximum number of outstanding 
           requests, we send data to the parent using blocking send */
        if ((0 == max_outstanding_reqs) || 
            (num_segments <= max_outstanding_reqs)) {
            
            segindex = 0;
            while ( original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                smpi_mpi_send((char*)sendbuf + 
                                         segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev, 
                                         COLL_TAG_REDUCE,
                                         comm) ;
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking synchronous send,
           - for remaining segments
           - wait for a ssend to complete, and post the next one.
           - wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
            MPI_Request* sreq = NULL;

            sreq = (MPI_Request*) calloc( max_outstanding_reqs,
                                              sizeof(MPI_Request ) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex]=smpi_mpi_isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                smpi_mpi_wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                sreq[creq]=smpi_mpi_isend((char*)sendbuf + 
                                          segindex * segment_increment, 
                                          count_by_segment, datatype, 
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm );
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining request to complete */
            smpi_mpi_waitall( max_outstanding_reqs, sreq, 
                                         MPI_STATUSES_IGNORE );

            /* free requests */
            free(sreq);
        }
    }
    return MPI_SUCCESS;

 error_hndl:  /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n", 
                   rank, __FILE__, line, ret );
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf);
    return ret;
}
Exemple #6
0
/*
 * reduce_intra_in_order_binary 
 * 
 * Function:      Logarithmic reduce operation for non-commutative operations.
 * Acecpts:       same as MPI_Reduce()
 * Returns:       MPI_SUCCESS or error code
 */
int smpi_coll_tuned_reduce_ompi_in_order_binary( void *sendbuf, void *recvbuf,
                                                  int count, 
                                                  MPI_Datatype datatype,
                                                  MPI_Op  op, int root,
                                                  MPI_Comm  comm)
{
    uint32_t segsize=0;
    int ret;
    int rank, size, io_root;
    int segcount = count;
    void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL;
    size_t typelng;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    XBT_DEBUG("coll:tuned:reduce_intra_in_order_binary rank %d ss %5d",
                 rank, segsize);

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    typelng=smpi_datatype_size( datatype);
    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount );

    /* An in-order binary tree must use root (size-1) to preserve the order of
       operations.  Thus, if root is not rank (size - 1), then we must handle
       1. MPI_IN_PLACE option on real root, and 
       2. we must allocate temporary recvbuf on rank (size - 1).
       Note that generic function must be careful not to switch order of 
       operations for non-commutative ops.
    */
    io_root = size - 1;
    use_this_sendbuf = sendbuf;
    use_this_recvbuf = recvbuf;
    if (io_root != root) {
        ptrdiff_t text, ext;
        char *tmpbuf = NULL;
    
        ext=smpi_datatype_get_extent(datatype);
        text=smpi_datatype_get_extent(datatype);

        if ((root == rank) && (MPI_IN_PLACE == sendbuf)) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            smpi_datatype_copy (
                                                (char*)recvbuf, count, datatype,
                                                (char*)tmpbuf, count, datatype);
            use_this_sendbuf = tmpbuf;
        } else if (io_root == rank) {
            tmpbuf = (char *) malloc(text + (count - 1) * ext);
            if (NULL == tmpbuf) {
                return MPI_ERR_INTERN;
            }
            use_this_recvbuf = tmpbuf;
        }
    }

    /* Use generic reduce with in-order binary tree topology and io_root */
    ret = smpi_coll_tuned_ompi_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype,
                                          op, io_root, comm, 
                                          ompi_coll_tuned_topo_build_in_order_bintree(comm), 
                                          segcount, 0 );
    if (MPI_SUCCESS != ret) { return ret; }

    /* Clean up */
    if (io_root != root) {
        if (root == rank) {
            /* Receive result from rank io_root to recvbuf */
            smpi_mpi_recv(recvbuf, count, datatype, io_root,
                                    COLL_TAG_REDUCE, comm,
                                    MPI_STATUS_IGNORE);
            if (MPI_IN_PLACE == sendbuf) {
                free(use_this_sendbuf);
            }
          
        } else if (io_root == rank) {
            /* Send result from use_this_recvbuf to root */
            smpi_mpi_send(use_this_recvbuf, count, datatype, root,
                                    COLL_TAG_REDUCE,
                                    comm);
            free(use_this_recvbuf);
        }
    }

    return MPI_SUCCESS;
}
/**
 * Alltoall Bruck
 *
 * Openmpi calls this routine when the message size sent to each rank < 2000 bytes and size < 12
 * FIXME: uh, check smpi_pmpi again, but this routine is called for > 12, not
 * less...
 **/
int smpi_coll_tuned_alltoallv_bruck(void *sendbuf, int *sendcounts, int *senddisps,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int *recvcounts, int *recvdisps, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int system_tag = COLL_TAG_ALLTOALLV;
  int i, rank, size, err, count;
  MPI_Aint lb;
  MPI_Aint sendext = 0;
  MPI_Aint recvext = 0;
  MPI_Request *requests;

  // FIXME: check implementation
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank);

  smpi_datatype_extent(sendtype, &lb, &sendext);
  smpi_datatype_extent(recvtype, &lb, &recvext);
  /* Local copy from self */
  err =
      smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext,
                         sendcounts[rank], sendtype,
                         (char *)recvbuf + recvdisps[rank] * recvext,
                         recvcounts[rank], recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */

      int bblock = 4;//MPIR_PARAM_ALLTOALL_THROTTLE
      //if (bblock == 0) bblock = comm_size;


     // requests = xbt_new(MPI_Request, 2 * (bblock - 1));
      int ii, ss, dst;
      /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */
      for (ii=0; ii<size; ii+=bblock) {
          requests = xbt_new(MPI_Request, 2 * (bblock ));

          ss = size-ii < bblock ? size-ii : bblock;
          count = 0;

          /* do the communication -- post ss sends and receives: */
          for ( i=0; i<ss; i++ ) {
            dst = (rank+i+ii) % size;
              if (dst == rank) {
                XBT_DEBUG("<%d> skip request creation [src = %d, recvcount = %d]",
                       rank, i, recvcounts[dst]);
                continue;
              }

              requests[count]=smpi_mpi_irecv((char *)recvbuf + recvdisps[dst] * recvext, recvcounts[dst],
                                  recvtype, dst, system_tag, comm );
              count++;
            }
            /* Now create all sends  */
          for ( i=0; i<ss; i++ ) {
              dst = (rank-i-ii+size) % size;
              if (dst == rank) {
                XBT_DEBUG("<%d> skip request creation [dst = %d, sendcount = %d]",
                       rank, i, sendcounts[dst]);
                continue;
              }
              requests[count]=smpi_mpi_isend((char *)sendbuf + senddisps[dst] * sendext, sendcounts[dst],
                                  sendtype, dst, system_tag, comm);
              count++;
            }
            /* Wait for them all. */
            //smpi_mpi_startall(count, requests);
            XBT_DEBUG("<%d> wait for %d requests", rank, count);
            smpi_mpi_waitall(count, requests, MPI_STATUSES_IGNORE);
            xbt_free(requests);

          }

  }
  return MPI_SUCCESS;
}
/*****************************************************************************
 * Function: allgather_mpich_ring
 * return: int
 * inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to received
 *   recv_type: data type of elements being received
 *   comm: communication
 ****************************************************************************/
int
smpi_coll_tuned_allgatherv_mpich_ring(void *sendbuf, int sendcount,
    MPI_Datatype send_type, void *recvbuf,
    int *recvcounts, int *displs, MPI_Datatype recvtype,
    MPI_Comm comm)
{

  char * sbuf = NULL, * rbuf = NULL;
  int soffset, roffset;
  int torecv=0, tosend=0, min, rank, comm_size;
  int sendnow, recvnow;
  int sidx, ridx;
  MPI_Status status;
  MPI_Aint recvtype_extent;
  int right, left, total_count, i;
  rank= smpi_comm_rank(comm);
  comm_size=smpi_comm_size(comm);

  recvtype_extent= smpi_datatype_get_extent( recvtype);
  total_count = 0;
  for (i=0; i<comm_size; i++)
    total_count += recvcounts[i];

  if (sendbuf != MPI_IN_PLACE) {
      /* First, load the "local" version in the recvbuf. */
      smpi_datatype_copy(sendbuf, sendcount, send_type,
          ((char *)recvbuf + displs[rank]*recvtype_extent),
          recvcounts[rank], recvtype);
  }

  left  = (comm_size + rank - 1) % comm_size;
  right = (rank + 1) % comm_size;

  torecv = total_count - recvcounts[rank];
  tosend = total_count - recvcounts[right];

  min = recvcounts[0];
  for (i = 1; i < comm_size; i++)
    if (min > recvcounts[i])
      min = recvcounts[i];
  if (min * recvtype_extent < 32768*8)
    min = 32768*8 / recvtype_extent;
  /* Handle the case where the datatype extent is larger than
   * the pipeline size. */
  if (!min)
    min = 1;

  sidx = rank;
  ridx = left;
  soffset = 0;
  roffset = 0;
  while (tosend || torecv) { /* While we have data to send or receive */
      sendnow = ((recvcounts[sidx] - soffset) > min) ? min : (recvcounts[sidx] - soffset);
      recvnow = ((recvcounts[ridx] - roffset) > min) ? min : (recvcounts[ridx] - roffset);
      sbuf = (char *)recvbuf + ((displs[sidx] + soffset) * recvtype_extent);
      rbuf = (char *)recvbuf + ((displs[ridx] + roffset) * recvtype_extent);

      /* Protect against wrap-around of indices */
      if (!tosend)
        sendnow = 0;
      if (!torecv)
        recvnow = 0;

      /* Communicate */
      if (!sendnow && !recvnow) {
          /* Don't do anything. This case is possible if two
           * consecutive processes contribute 0 bytes each. */
      }
      else if (!sendnow) { /* If there's no data to send, just do a recv call */
          smpi_mpi_recv(rbuf, recvnow, recvtype, left, COLL_TAG_ALLGATHERV, comm, &status);

          torecv -= recvnow;
      }
      else if (!recvnow) { /* If there's no data to receive, just do a send call */
          smpi_mpi_send(sbuf, sendnow, recvtype, right, COLL_TAG_ALLGATHERV, comm);

          tosend -= sendnow;
      }
      else { /* There's data to be sent and received */
          smpi_mpi_sendrecv(sbuf, sendnow, recvtype, right, COLL_TAG_ALLGATHERV,
              rbuf, recvnow, recvtype, left, COLL_TAG_ALLGATHERV,
              comm, &status);
          tosend -= sendnow;
          torecv -= recvnow;
      }

      soffset += sendnow;
      roffset += recvnow;
      if (soffset == recvcounts[sidx]) {
          soffset = 0;
          sidx = (sidx + comm_size - 1) % comm_size;
      }
      if (roffset == recvcounts[ridx]) {
          roffset = 0;
          ridx = (ridx + comm_size - 1) % comm_size;
      }
  }

  return MPI_SUCCESS;
}
int 
smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                               MPI_Datatype dtype,
                                               MPI_Op op,
                                               MPI_Comm comm) 
{
   int ret = MPI_SUCCESS;
   int line;
   int k, recv_from, send_to;
   int early_blockcount, late_blockcount, split_rank; 
   int segcount, max_segcount;
   int num_phases, phase;
   int block_count;
   unsigned int inbi;
   size_t typelng;
   char *tmpsend = NULL, *tmprecv = NULL;
   char *inbuf[2] = {NULL, NULL};
   ptrdiff_t true_extent, extent;
   ptrdiff_t block_offset, max_real_segsize;
   MPI_Request reqs[2] = {NULL, NULL};
   const size_t segsize = 1 << 20; /* 1 MB */
   unsigned int size = smpi_comm_size(comm);
   unsigned int rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

   /* Special case for size == 1 */
   if (1 == size) {
      if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
         if (ret < 0) { line = __LINE__; goto error_hndl; }
      }
      return MPI_SUCCESS;
   }
   
   /* Determine segment count based on the suggested segment size */
   extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   true_extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   typelng = smpi_datatype_size(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   segcount = count;
   COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

   /* Special case for count less than size * segcount - use regular ring */
   if (count < size * segcount) {
      XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
      return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, 
                                                   comm));
   }

   /* Determine the number of phases of the algorithm */
   num_phases = count / (size * segcount);
   if ((count % (size * segcount) >= size) && 
       (count % (size * segcount) > ((size * segcount) / 2))) {
      num_phases++;
   }

   /* Determine the number of elements per block and corresponding 
      block sizes.
      The blocks are divided into "early" and "late" ones:
      blocks 0 .. (split_rank - 1) are "early" and 
      blocks (split_rank) .. (size - 1) are "late".
      Early blocks are at most 1 element larger than the late ones.
      Note, these blocks will be split into num_phases segments,
      out of the largest one will have max_segcount elements.
    */
   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, 
                                  early_blockcount, late_blockcount )
   COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                  max_segcount, k)
   max_real_segsize = true_extent + (max_segcount - 1) * extent;

   /* Allocate and initialize temporary buffers */
   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
   if (size > 2) {
      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
   }

   /* Handle MPI_IN_PLACE */
   if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
   }

   /* Computation loop: for each phase, repeat ring allreduce computation loop */
   for (phase = 0; phase < num_phases; phase ++) {
      ptrdiff_t phase_offset;
      int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

      /* 
         For each of the remote nodes:
         - post irecv for block (r-1)
         - send block (r)
           To do this, first compute block offset and count, and use block offset
           to compute phase offset.
         - in loop for every step k = 2 .. n
           - post irecv for block (r + n - k) % n
           - wait on block (r + n - k + 1) % n to arrive
           - compute on block (r + n - k + 1) % n
           - send block (r + n - k + 1) % n
         - wait on block (r + 1)
         - compute on block (r + 1)
         - send block (r + 1) to rank (r + 1)
         Note that we must be careful when computing the begining of buffers and
         for send operations and computation we must compute the exact block size.
      */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
      
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
         - send data */
      block_offset = ((rank < split_rank)? 
                      (rank * early_blockcount) : 
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_mpi_send(tmpsend, phase_count, dtype, send_to,
                              666, comm);
      
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
         
         inbi = inbi ^ 0x1;
         
         /* Post irecv for the current block */
         reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
         if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
         
         /* Wait on previous block to arrive */
         smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
         
         /* Apply operation on previous block: result goes to rbuf
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         block_offset = ((prevblock < split_rank)?
                         (prevblock * early_blockcount) :
                         (prevblock * late_blockcount + split_rank));
         block_count = ((prevblock < split_rank)? 
                        early_blockcount : late_blockcount);
         COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount)
         phase_count = ((phase < split_phase)?
                        (early_phase_segcount) : (late_phase_segcount));
         phase_offset = ((phase < split_phase)?
                         (phase * early_phase_segcount) : 
                         (phase * late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
         smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);
         /* send previous block to send_to */
         smpi_mpi_send(tmprecv, phase_count, dtype, send_to,
                              666, comm);
      }
      
      /* Wait on the last block to arrive */
      smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

      
      /* Apply operation on the last block (from neighbor (rank + 1) 
         rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
      recv_from = (rank + 1) % size;
      block_offset = ((recv_from < split_rank)?
                      (recv_from * early_blockcount) :
                      (recv_from * late_blockcount + split_rank));
      block_count = ((recv_from < split_rank)? 
                     early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
   }

   /* Distribution loop - variation of ring allgather */
   send_to = (rank + 1) % size;
   recv_from = (rank + size - 1) % size;
   for (k = 0; k < size - 1; k++) {
      const int recv_data_from = (rank + size - k) % size;
      const int send_data_from = (rank + 1 + size - k) % size;
      const int send_block_offset = 
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
      const int recv_block_offset = 
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
          (recv_data_from * late_blockcount + split_rank));
      block_count = ((send_data_from < split_rank)? 
                     early_blockcount : late_blockcount);

      tmprecv = (char*)rbuf + recv_block_offset * extent;
      tmpsend = (char*)rbuf + send_block_offset * extent;

      smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to,
                                     666,
                                     tmprecv, early_blockcount, dtype, recv_from,
                                     666,
                                     comm, MPI_STATUS_IGNORE);

   }

   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

   return MPI_SUCCESS;

 error_hndl:
   XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n",
                __FILE__, line, rank, ret);
   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
   return ret;
}
Exemple #10
0
int smpi_coll_tuned_reduce_mvapich2_knomial (
        void *sendbuf,
        void *recvbuf,
        int count,
        MPI_Datatype datatype,
        MPI_Op op,
        int root,
        MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, is_commutative;
    int src, k;
    MPI_Request send_request;
    int index=0;
    MPI_Aint true_lb, true_extent, extent;
    MPI_Status status; 
    int recv_iter=0, dst=-1, expected_send_count, expected_recv_count;
    int *src_array=NULL;
    void **tmp_buf=NULL;
    MPI_Request *requests=NULL;


    if (count == 0) return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);

    /* Create a temporary buffer */

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    is_commutative = smpi_op_is_commute(op);

    if (rank != root) {
        recvbuf=(void *)smpi_get_tmp_recvbuffer(count*(MAX(extent,true_extent)));
        recvbuf = (void *)((char*)recvbuf - true_lb);
    }

    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf,
                count, datatype);
    }


    if(mv2_reduce_intra_knomial_factor<0)
      {
        mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }
    if(mv2_reduce_inter_knomial_factor<0)
      {
        mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }


    MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm, 
           &dst, &expected_send_count, &expected_recv_count, &src_array);

    if(expected_recv_count > 0 ) {
        tmp_buf  = xbt_malloc(sizeof(void *)*expected_recv_count);
        requests = xbt_malloc(sizeof(MPI_Request)*expected_recv_count);
        for(k=0; k < expected_recv_count; k++ ) {
            tmp_buf[k] = smpi_get_tmp_sendbuffer(count*(MAX(extent,true_extent)));
            tmp_buf[k] = (void *)((char*)tmp_buf[k] - true_lb);
        }

        while(recv_iter  < expected_recv_count) {
            src = src_array[expected_recv_count - (recv_iter+1)];

            requests[recv_iter]=smpi_mpi_irecv (tmp_buf[recv_iter], count, datatype ,src,
                    COLL_TAG_REDUCE, comm);
            recv_iter++;

        }

        recv_iter=0;
        while(recv_iter < expected_recv_count) {
            index=smpi_mpi_waitany(expected_recv_count, requests,
                    &status);
            recv_iter++;

            if (is_commutative) {
              smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
            }
        }

        for(k=0; k < expected_recv_count; k++ ) {
            smpi_free_tmp_buffer(tmp_buf[k]);
        }
        xbt_free(tmp_buf);
        xbt_free(requests);
    }

    if(src_array != NULL) { 
        xbt_free(src_array);
    } 

    if(rank != root) {
        send_request=smpi_mpi_isend(recvbuf,count, datatype, dst,
                COLL_TAG_REDUCE,comm);

        smpi_mpi_waitall(1, &send_request, &status);

        smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
    }

    /* --END ERROR HANDLING-- */

    return mpi_errno;
}
int smpi_coll_tuned_reduce_scatter_mpich_pair(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf;
    int mpi_errno = MPI_SUCCESS;
    int total_count, dst, src;
    int is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent =smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    
    if (smpi_op_is_commute(op)) {
        is_commutative = 1;
    }

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
    
    if (total_count == 0) {
        return MPI_ERR_COUNT;
    }

        if (sendbuf != MPI_IN_PLACE) {
            /* copy local data into recvbuf */
            smpi_datatype_copy(((char *)sendbuf+disps[rank]*extent),
                                       recvcounts[rank], datatype, recvbuf,
                                       recvcounts[rank], datatype);
        }
        
        /* allocate temporary buffer to store incoming data */
        tmp_recvbuf = (void*)xbt_malloc(recvcounts[rank]*(max(true_extent,extent))+1);
        /* adjust for potential negative lower bound in datatype */
        tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);
        
        for (i=1; i<comm_size; i++) {
            src = (rank - i + comm_size) % comm_size;
            dst = (rank + i) % comm_size;
            
            /* send the data that dst needs. recv data that this process
               needs from src into tmp_recvbuf */
            if (sendbuf != MPI_IN_PLACE) 
                smpi_mpi_sendrecv(((char *)sendbuf+disps[dst]*extent), 
                                             recvcounts[dst], datatype, dst,
                                             COLL_TAG_SCATTER, tmp_recvbuf,
                                             recvcounts[rank], datatype, src,
                                             COLL_TAG_SCATTER, comm,
                                             MPI_STATUS_IGNORE);
            else
                smpi_mpi_sendrecv(((char *)recvbuf+disps[dst]*extent), 
                                             recvcounts[dst], datatype, dst,
                                             COLL_TAG_SCATTER, tmp_recvbuf,
                                             recvcounts[rank], datatype, src,
                                             COLL_TAG_SCATTER, comm,
                                             MPI_STATUS_IGNORE);
            
            if (is_commutative || (src < rank)) {
                if (sendbuf != MPI_IN_PLACE) {
		     smpi_op_apply( op,
			                          tmp_recvbuf, recvbuf, &recvcounts[rank],
                               &datatype); 
                }
                else {
		    smpi_op_apply(op, 
			tmp_recvbuf, ((char *)recvbuf+disps[rank]*extent), 
			&recvcounts[rank], &datatype);
                    /* we can't store the result at the beginning of
                       recvbuf right here because there is useful data
                       there that other process/processes need. at the
                       end, we will copy back the result to the
                       beginning of recvbuf. */
                }
            }
            else {
                if (sendbuf != MPI_IN_PLACE) {
		    smpi_op_apply(op, 
		       recvbuf, tmp_recvbuf, &recvcounts[rank], &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                               datatype, recvbuf,
                                               recvcounts[rank], datatype);
                    if (mpi_errno) return(mpi_errno);
                }
                else {
		    smpi_op_apply(op, 
                        ((char *)recvbuf+disps[rank]*extent),
			tmp_recvbuf, &recvcounts[rank], &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_recvbuf, recvcounts[rank],
                                               datatype, 
                                               ((char *)recvbuf +
                                                disps[rank]*extent), 
                                               recvcounts[rank], datatype);
                    if (mpi_errno) return(mpi_errno);
                }
            }
        }
        
        /* if MPI_IN_PLACE, move output data to the beginning of
           recvbuf. already done for rank 0. */
        if ((sendbuf == MPI_IN_PLACE) && (rank != 0)) {
            mpi_errno = smpi_datatype_copy(((char *)recvbuf +
                                        disps[rank]*extent),  
                                       recvcounts[rank], datatype,
                                       recvbuf, 
                                       recvcounts[rank], datatype );
            if (mpi_errno) return(mpi_errno);
        }
    
return MPI_SUCCESS;
}
int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int   rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb; 
    int  *disps;
    void *tmp_recvbuf, *tmp_results;
    int mpi_errno = MPI_SUCCESS;
    int dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    comm_size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    extent =smpi_datatype_get_extent(datatype);
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    
    if (smpi_op_is_commute(op)) {
        is_commutative = 1;
    }

    disps = (int*)xbt_malloc( comm_size * sizeof(int));

    total_count = 0;
    for (i=0; i<comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }
    
            /* noncommutative and (non-pof2 or block irregular), use recursive doubling. */

            /* need to allocate temporary buffer to receive incoming data*/
            tmp_recvbuf= (void *) xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb);

            /* need to allocate another temporary buffer to accumulate
               results */
            tmp_results = (void *)xbt_malloc( total_count*(max(true_extent,extent)));
            /* adjust for potential negative lower bound in datatype */
            tmp_results = (void *)((char*)tmp_results - true_lb);

            /* copy sendbuf into tmp_results */
            if (sendbuf != MPI_IN_PLACE)
                mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);
            else
                mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                           tmp_results, total_count, datatype);

            if (mpi_errno) return(mpi_errno);

            mask = 0x1;
            i = 0;
            while (mask < comm_size) {
                dst = rank ^ mask;

                dst_tree_root = dst >> i;
                dst_tree_root <<= i;

                my_tree_root = rank >> i;
                my_tree_root <<= i;

                /* At step 1, processes exchange (n-n/p) amount of
                   data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p)
                   amount of data, and so forth. We use derived datatypes for this.

                   At each step, a process does not need to send data
                   indexed from my_tree_root to
                   my_tree_root+mask-1. Similarly, a process won't receive
                   data indexed from dst_tree_root to dst_tree_root+mask-1. */

                /* calculate sendtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<my_tree_root; j++)
                    blklens[0] += recvcounts[j];
                for (j=my_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&sendtype);

                /* calculate recvtype */
                blklens[0] = blklens[1] = 0;
                for (j=0; j<dst_tree_root && j<comm_size; j++)
                    blklens[0] += recvcounts[j];
                for (j=dst_tree_root+mask; j<comm_size; j++)
                    blklens[1] += recvcounts[j];

                dis[0] = 0;
                dis[1] = blklens[0];
                for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++)
                    dis[1] += recvcounts[j];

                mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
                if (mpi_errno) return(mpi_errno);
                
                smpi_datatype_commit(&recvtype);

                received = 0;
                if (dst < comm_size) {
                    /* tmp_results contains data to be sent in each step. Data is
                       received in tmp_recvbuf and then accumulated into
                       tmp_results. accumulation is done later below.   */ 

                    smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst,
                                                 COLL_TAG_SCATTER,
                                                 tmp_recvbuf, 1, recvtype, dst,
                                                 COLL_TAG_SCATTER, comm,
                                                 MPI_STATUS_IGNORE);
                    received = 1;
                }

                /* if some processes in this process's subtree in this step
                   did not have any destination process to communicate with
                   because of non-power-of-two, we need to send them the
                   result. We use a logarithmic recursive-halfing algorithm
                   for this. */

                if (dst_tree_root + mask > comm_size) {
                    nprocs_completed = comm_size - my_tree_root - mask;
                    /* nprocs_completed is the number of processes in this
                       subtree that have all the data. Send data to others
                       in a tree fashion. First find root of current tree
                       that is being divided into two. k is the number of
                       least-significant bits in this process's rank that
                       must be zeroed out to find the rank of the root */ 
                    j = mask;
                    k = 0;
                    while (j) {
                        j >>= 1;
                        k++;
                    }
                    k--;

                    tmp_mask = mask >> 1;
                    while (tmp_mask) {
                        dst = rank ^ tmp_mask;

                        tree_root = rank >> k;
                        tree_root <<= k;

                        /* send only if this proc has data and destination
                           doesn't have data. at any step, multiple processes
                           can send if they have the data */
                        if ((dst > rank) && 
                            (rank < tree_root + nprocs_completed)
                            && (dst >= tree_root + nprocs_completed)) {
                            /* send the current result */
                            smpi_mpi_send(tmp_recvbuf, 1, recvtype,
                                                     dst, COLL_TAG_SCATTER,
                                                     comm);
                        }
                        /* recv only if this proc. doesn't have data and sender
                           has data */
                        else if ((dst < rank) && 
                                 (dst < tree_root + nprocs_completed) &&
                                 (rank >= tree_root + nprocs_completed)) {
                            smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst,
                                                     COLL_TAG_SCATTER,
                                                     comm, MPI_STATUS_IGNORE); 
                            received = 1;
                        }
                        tmp_mask >>= 1;
                        k--;
                    }
                }

                /* The following reduction is done here instead of after 
                   the MPIC_Sendrecv_ft or MPIC_Recv_ft above. This is
                   because to do it above, in the noncommutative 
                   case, we would need an extra temp buffer so as not to
                   overwrite temp_recvbuf, because temp_recvbuf may have
                   to be communicated to other processes in the
                   non-power-of-two case. To avoid that extra allocation,
                   we do the reduce here. */
                if (received) {
                    if (is_commutative || (dst_tree_root < my_tree_root)) {
                        {
			         smpi_op_apply(op, 
                               tmp_recvbuf, tmp_results, &blklens[0],
			       &datatype); 
			        smpi_op_apply(op, 
                               ((char *)tmp_recvbuf + dis[1]*extent),
			       ((char *)tmp_results + dis[1]*extent),
			       &blklens[1], &datatype); 
                        }
                    }
                    else {
                        {
			         smpi_op_apply(op,
                                   tmp_results, tmp_recvbuf, &blklens[0],
                                   &datatype); 
			         smpi_op_apply(op,
                                   ((char *)tmp_results + dis[1]*extent),
                                   ((char *)tmp_recvbuf + dis[1]*extent),
                                   &blklens[1], &datatype); 
                        }
                        /* copy result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype, 
                                                   tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                //smpi_datatype_free(&sendtype);
                //smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }
int smpi_coll_tuned_reduce_scatter_mpich_noncomm(void *sendbuf, void *recvbuf, int recvcounts[],
                              MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size = smpi_comm_size(comm) ;
    int rank = smpi_comm_rank(comm);
    int pof2;
    int log2_comm_size;
    int i, k;
    int recv_offset, send_offset;
    int block_size, total_count, size;
    MPI_Aint true_extent, true_lb;
    int buf0_was_inout;
    void *tmp_buf0;
    void *tmp_buf1;
    void *result_ptr;

    smpi_datatype_extent(datatype, &true_lb, &true_extent);

    pof2 = 1;
    log2_comm_size = 0;
    while (pof2 < comm_size) {
        pof2 <<= 1;
        ++log2_comm_size;
    }

    /* begin error checking */
    xbt_assert(pof2 == comm_size); /* FIXME this version only works for power of 2 procs */

    for (i = 0; i < (comm_size - 1); ++i) {
        xbt_assert(recvcounts[i] == recvcounts[i+1]);
    }
    /* end error checking */

    /* size of a block (count of datatype per block, NOT bytes per block) */
    block_size = recvcounts[0];
    total_count = block_size * comm_size;

    tmp_buf0=( void *)xbt_malloc( true_extent * total_count);
    tmp_buf1=( void *)xbt_malloc( true_extent * total_count);
    /* adjust for potential negative lower bound in datatype */
    tmp_buf0 = (void *)((char*)tmp_buf0 - true_lb);
    tmp_buf1 = (void *)((char*)tmp_buf1 - true_lb);

    /* Copy our send data to tmp_buf0.  We do this one block at a time and
       permute the blocks as we go according to the mirror permutation. */
    for (i = 0; i < comm_size; ++i) {
        mpi_errno = smpi_datatype_copy((char *)(sendbuf == MPI_IN_PLACE ? recvbuf : sendbuf) + (i * true_extent * block_size), block_size, datatype,
                                   (char *)tmp_buf0 + (MPIU_Mirror_permutation(i, log2_comm_size) * true_extent * block_size), block_size, datatype);
        if (mpi_errno) return(mpi_errno);
    }
    buf0_was_inout = 1;

    send_offset = 0;
    recv_offset = 0;
    size = total_count;
    for (k = 0; k < log2_comm_size; ++k) {
        /* use a double-buffering scheme to avoid local copies */
        char *incoming_data = (buf0_was_inout ? tmp_buf1 : tmp_buf0);
        char *outgoing_data = (buf0_was_inout ? tmp_buf0 : tmp_buf1);
        int peer = rank ^ (0x1 << k);
        size /= 2;

        if (rank > peer) {
            /* we have the higher rank: send top half, recv bottom half */
            recv_offset += size;
        }
        else {
            /* we have the lower rank: recv top half, send bottom half */
            send_offset += size;
        }

        smpi_mpi_sendrecv(outgoing_data + send_offset*true_extent,
                                     size, datatype, peer, COLL_TAG_SCATTER,
                                     incoming_data + recv_offset*true_extent,
                                     size, datatype, peer, COLL_TAG_SCATTER,
                                     comm, MPI_STATUS_IGNORE);
        /* always perform the reduction at recv_offset, the data at send_offset
           is now our peer's responsibility */
        if (rank > peer) {
            /* higher ranked value so need to call op(received_data, my_data) */
            smpi_op_apply(op, 
                   incoming_data + recv_offset*true_extent,
                     outgoing_data + recv_offset*true_extent,
                     &size, &datatype );
            /* buf0_was_inout = buf0_was_inout; */
        }
        else {
            /* lower ranked value so need to call op(my_data, received_data) */
	    smpi_op_apply( op,
		     outgoing_data + recv_offset*true_extent,
                     incoming_data + recv_offset*true_extent,
                     &size, &datatype);
            buf0_was_inout = !buf0_was_inout;
        }

        /* the next round of send/recv needs to happen within the block (of size
           "size") that we just received and reduced */
        send_offset = recv_offset;
    }

    xbt_assert(size == recvcounts[rank]);

    /* copy the reduced data to the recvbuf */
    result_ptr = (char *)(buf0_was_inout ? tmp_buf0 : tmp_buf1) + recv_offset * true_extent;
    mpi_errno = smpi_datatype_copy(result_ptr, size, datatype,
                               recvbuf, size, datatype);
    if (mpi_errno) return(mpi_errno);
    return MPI_SUCCESS;
}
                                   ((char *)tmp_results + dis[1]*extent),
                                   ((char *)tmp_recvbuf + dis[1]*extent),
                                   &blklens[1], &datatype); 
                        }
                        /* copy result back into tmp_results */
                        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype, 
                                                   tmp_results, 1, recvtype);
                        if (mpi_errno) return(mpi_errno);
                    }
                }

                //smpi_datatype_free(&sendtype);
                //smpi_datatype_free(&recvtype);

                mask <<= 1;
                i++;
            }

            /* now copy final results from tmp_results to recvbuf */
            mpi_errno = smpi_datatype_copy(((char *)tmp_results+disps[rank]*extent),
                                       recvcounts[rank], datatype, recvbuf,
                                       recvcounts[rank], datatype);
            if (mpi_errno) return(mpi_errno);
    xbt_free(disps);
    xbt_free(tmp_recvbuf);
    xbt_free(tmp_results);
    return MPI_SUCCESS;
        }


int smpi_coll_tuned_reduce_mvapich2_two_level( void *sendbuf,
                                     void *recvbuf,
                                     int count,
                                     MPI_Datatype datatype,
                                     MPI_Op op,
                                     int root,
                                     MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int my_rank, total_size, local_rank, local_size;
    int leader_comm_rank = -1, leader_comm_size = 0;
    MPI_Comm shmem_comm, leader_comm;
    int leader_root, leader_of_root;
    void *in_buf = NULL, *out_buf = NULL, *tmp_buf = NULL;
    MPI_Aint true_lb, true_extent, extent;
    int is_commutative = 0, stride = 0;
    int intra_node_root=0; 
    
    //if not set (use of the algo directly, without mvapich2 selector)
    if(MV2_Reduce_function==NULL)
      MV2_Reduce_function=smpi_coll_tuned_reduce_mpich;
    if(MV2_Reduce_intra_function==NULL)
      MV2_Reduce_intra_function=smpi_coll_tuned_reduce_mpich;

    if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
      smpi_comm_init_smp(comm);
    }
  
    my_rank = smpi_comm_rank(comm);
    total_size = smpi_comm_size(comm);
    shmem_comm = smpi_comm_get_intra_comm(comm);
    local_rank = smpi_comm_rank(shmem_comm);
    local_size = smpi_comm_size(shmem_comm);
    
    leader_comm = smpi_comm_get_leaders_comm(comm);
    int* leaders_map = smpi_comm_get_leaders_map(comm);
    leader_of_root = smpi_group_rank(smpi_comm_group(comm),leaders_map[root]);
    leader_root = smpi_group_rank(smpi_comm_group(leader_comm),leaders_map[root]);

    is_commutative=smpi_op_is_commute(op);

    smpi_datatype_extent(datatype, &true_lb,
                                       &true_extent);
    extent =smpi_datatype_get_extent(datatype);
    stride = count * MAX(extent, true_extent);

    if (local_size == total_size) {
        /* First handle the case where there is only one node */
        if (stride <= MV2_INTRA_SHMEM_REDUCE_MSG &&
            is_commutative == 1) {
            if (local_rank == 0 ) {
                tmp_buf=(void *)smpi_get_tmp_sendbuffer( count *
                                    (MAX(extent, true_extent)));
                tmp_buf = (void *) ((char *) tmp_buf - true_lb);
            }

            if (sendbuf != MPI_IN_PLACE) {
                in_buf = (void *)sendbuf;
            } else {
                in_buf = recvbuf;
            }

            if (local_rank == 0) { 
                 if( my_rank != root) {
                     out_buf = tmp_buf;
                 } else { 
                     out_buf = recvbuf; 
                     if(in_buf == out_buf) { 
                        in_buf = MPI_IN_PLACE; 
                        out_buf = recvbuf; 
                     } 
                 } 
            } else {
                in_buf  = (void *)sendbuf; 
                out_buf = NULL;
            }

	    if (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE) {
		mpi_errno = MPIR_Reduce_shmem_MV2(in_buf, out_buf, count,
						  datatype, op,
						  0, shmem_comm);
	    }
	    else {
		mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
								  datatype, op,
								  0, shmem_comm);
	    }
	    
            if (local_rank == 0 && root != my_rank) {
                smpi_mpi_send(out_buf, count, datatype, root,
                                         COLL_TAG_REDUCE+1, comm);
            }
            if ((local_rank != 0) && (root == my_rank)) {
                smpi_mpi_recv(recvbuf, count, datatype,
                                         leader_of_root, COLL_TAG_REDUCE+1, comm,
                                         MPI_STATUS_IGNORE);
            }
        } else {
            if(mv2_use_knomial_reduce == 1) { 
                reduce_fn = &MPIR_Reduce_intra_knomial_wrapper_MV2; 
            } else { 
                reduce_fn = &MPIR_Reduce_binomial_MV2; 
            } 
            mpi_errno = reduce_fn(sendbuf, recvbuf, count,
                                  datatype, op,
                                  root, comm);
        }
        /* We are done */
        if(tmp_buf!=NULL) 
          smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        goto fn_exit;
    }
    

    if (local_rank == 0) {
        leader_comm = smpi_comm_get_leaders_comm(comm);
        if(leader_comm==MPI_COMM_NULL){
          leader_comm = MPI_COMM_WORLD;
        }
        leader_comm_size = smpi_comm_size(leader_comm);
        leader_comm_rank = smpi_comm_rank(leader_comm);
        tmp_buf=(void *)smpi_get_tmp_sendbuffer(count *
                            (MAX(extent, true_extent)));
        tmp_buf = (void *) ((char *) tmp_buf - true_lb);
    }
    if (sendbuf != MPI_IN_PLACE) {
        in_buf = (void *)sendbuf;
    } else {
        in_buf = recvbuf;
    }
    if (local_rank == 0) {
        out_buf = tmp_buf;
    } else {
        out_buf = NULL;
    }


    if(local_size > 1) { 
        /* Lets do the intra-node reduce operations, if we have more than one
         * process in the node */

        /*Fix the input and outbuf buffers for the intra-node reduce.
         *Node leaders will have the reduced data in tmp_buf after 
         *this step*/
        if (MV2_Reduce_intra_function == & MPIR_Reduce_shmem_MV2)
        {
            if (is_commutative == 1
		&& (count * (MAX(extent, true_extent)) < SHMEM_COLL_BLOCK_SIZE)) {
                    mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            } else {
                    mpi_errno = MPIR_Reduce_intra_knomial_wrapper_MV2(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
            }
        } else {

            mpi_errno = MV2_Reduce_intra_function(in_buf, out_buf, count,
                                      datatype, op,
                                      intra_node_root, shmem_comm);
        }
    } else { 
        smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));
        tmp_buf = in_buf; 
    } 

    /* Now work on the inter-leader phase. Data is in tmp_buf */
    if (local_rank == 0 && leader_comm_size > 1) {
        /*The leader of root will have the global reduced data in tmp_buf 
           or recv_buf
           at the end of the reduce */
        if (leader_comm_rank == leader_root) {
            if (my_rank == root) {
                /* I am the root of the leader-comm, and the 
                 * root of the reduce op. So, I will write the 
                 * final result directly into my recvbuf */
                if(tmp_buf != recvbuf) { 
                    in_buf = tmp_buf;
                    out_buf = recvbuf;
                } else { 

                     in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                     smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                    //in_buf = MPI_IN_PLACE; 
                    out_buf = recvbuf; 
                } 
            } else {
                in_buf = (char *)smpi_get_tmp_sendbuffer(count*
                                       smpi_datatype_get_extent(datatype));
                smpi_datatype_copy(tmp_buf, count, datatype,
                                        in_buf, count, datatype);
                //in_buf = MPI_IN_PLACE;
                out_buf = tmp_buf;
            }
        } else {
            in_buf = tmp_buf;
            out_buf = NULL;
        }

        /* inter-leader communication  */
        mpi_errno = MV2_Reduce_function(in_buf, out_buf, count,
                              datatype, op,
                              leader_root, leader_comm);

    }

    if (local_size > 1) {
        /* Send the message to the root if the leader is not the
         * root of the reduce operation. The reduced data is in tmp_buf */
        if ((local_rank == 0) && (root != my_rank)
            && (leader_root == leader_comm_rank)) {
            smpi_mpi_send(tmp_buf, count, datatype, root,
                                     COLL_TAG_REDUCE+1, comm);
        }
        if ((local_rank != 0) && (root == my_rank)) {
            smpi_mpi_recv(recvbuf, count, datatype,
                                     leader_of_root,
                                     COLL_TAG_REDUCE+1, comm,
                                     MPI_STATUS_IGNORE);
        }
      smpi_free_tmp_buffer((void *) ((char *) tmp_buf + true_lb));

      if (leader_comm_rank == leader_root) {
        if (my_rank != root || (my_rank == root && tmp_buf == recvbuf)) { 
          smpi_free_tmp_buffer(in_buf);
        }
      }
    }



  fn_exit:
    return mpi_errno;
}
int smpi_coll_tuned_allreduce_rab_rdb(void *sbuff, void *rbuff, int count,
                                      MPI_Datatype dtype, MPI_Op op,
                                      MPI_Comm comm)
{
  int tag = COLL_TAG_ALLREDUCE;
  unsigned int mask, pof2;
  int dst, newrank, rem, newdst, i,
      send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
  MPI_Aint extent;
  MPI_Status status;
  void *tmp_buf = NULL;

  unsigned int nprocs = smpi_comm_size(comm);
  unsigned int rank = smpi_comm_rank(comm);

  extent = smpi_datatype_get_extent(dtype);
  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);

  smpi_datatype_copy(sbuff, count, dtype, rbuff, count, dtype);

  // find nearest power-of-two less than or equal to comm_size
  pof2 = 1;
  while (pof2 <= nprocs)
    pof2 <<= 1;
  pof2 >>= 1;

  rem = nprocs - pof2;

  // In the non-power-of-two case, all even-numbered
  // processes of rank < 2*rem send their data to
  // (rank+1). These even-numbered processes no longer
  // participate in the algorithm until the very end. The
  // remaining processes form a nice power-of-two. 

  if (rank < 2 * rem) {
    // even       
    if (rank % 2 == 0) {

      smpi_mpi_send(rbuff, count, dtype, rank + 1, tag, comm);

      // temporarily set the rank to -1 so that this
      // process does not pariticipate in recursive
      // doubling
      newrank = -1;
    } else                      // odd
    {
      smpi_mpi_recv(tmp_buf, count, dtype, rank - 1, tag, comm, &status);
      // do the reduction on received data. since the
      // ordering is right, it doesn't matter whether
      // the operation is commutative or not.
       smpi_op_apply(op, tmp_buf, rbuff, &count, &dtype);

      // change the rank 
      newrank = rank / 2;
    }
  }

  else                          // rank >= 2 * rem 
    newrank = rank - rem;

  // If op is user-defined or count is less than pof2, use
  // recursive doubling algorithm. Otherwise do a reduce-scatter
  // followed by allgather. (If op is user-defined,
  // derived datatypes are allowed and the user could pass basic
  // datatypes on one process and derived on another as long as
  // the type maps are the same. Breaking up derived
  // datatypes to do the reduce-scatter is tricky, therefore
  // using recursive doubling in that case.) 

  if (newrank != -1) {
    // do a reduce-scatter followed by allgather. for the
    // reduce-scatter, calculate the count that each process receives
    // and the displacement within the buffer 

    cnts = (int *) xbt_malloc(pof2 * sizeof(int));
    disps = (int *) xbt_malloc(pof2 * sizeof(int));

    for (i = 0; i < (pof2 - 1); i++)
      cnts[i] = count / pof2;
    cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

    disps[0] = 0;
    for (i = 1; i < pof2; i++)
      disps[i] = disps[i - 1] + cnts[i - 1];

    mask = 0x1;
    send_idx = recv_idx = 0;
    last_idx = pof2;
    while (mask < pof2) {
      newdst = newrank ^ mask;
      // find real rank of dest 
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        send_idx = recv_idx + pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      }

      // Send data from recvbuf. Recv into tmp_buf 
      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                   dtype, dst, tag,
                   (char *) tmp_buf + disps[recv_idx] * extent, recv_cnt,
                   dtype, dst, tag, comm, &status);

      // tmp_buf contains data received in this step.
      // recvbuf contains data accumulated so far 

      // This algorithm is used only for predefined ops
      // and predefined ops are always commutative.
      smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                        (char *) rbuff + disps[recv_idx] * extent, &recv_cnt, &dtype);

      // update send_idx for next iteration 
      send_idx = recv_idx;
      mask <<= 1;

      // update last_idx, but not in last iteration because the value
      // is needed in the allgather step below. 
      if (mask < pof2)
        last_idx = recv_idx + pof2 / mask;
    }

    // now do the allgather 

    mask >>= 1;
    while (mask > 0) {
      newdst = newrank ^ mask;
      // find real rank of dest
      dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

      send_cnt = recv_cnt = 0;
      if (newrank < newdst) {
        // update last_idx except on first iteration 
        if (mask != pof2 / 2)
          last_idx = last_idx + pof2 / (mask * 2);

        recv_idx = send_idx + pof2 / (mask * 2);
        for (i = send_idx; i < recv_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < last_idx; i++)
          recv_cnt += cnts[i];
      } else {
        recv_idx = send_idx - pof2 / (mask * 2);
        for (i = send_idx; i < last_idx; i++)
          send_cnt += cnts[i];
        for (i = recv_idx; i < send_idx; i++)
          recv_cnt += cnts[i];
      }

      smpi_mpi_sendrecv((char *) rbuff + disps[send_idx] * extent, send_cnt,
                   dtype, dst, tag,
                   (char *) rbuff + disps[recv_idx] * extent, recv_cnt,
                   dtype, dst, tag, comm, &status);

      if (newrank > newdst)
        send_idx = recv_idx;

      mask >>= 1;
    }

    free(cnts);
    free(disps);

  }
Exemple #17
0
int smpi_coll_tuned_reduce_binomial(void *sendbuf, void *recvbuf, int count,
                                    MPI_Datatype datatype, MPI_Op op, int root,
                                    MPI_Comm comm)
{
  MPI_Status status;
  int comm_size, rank;
  int mask, relrank, source;
  int dst;
  int tag = COLL_TAG_REDUCE;
  MPI_Aint extent;
  void *tmp_buf;
  MPI_Aint true_lb, true_extent;
  if (count == 0)
    return 0;
  rank = smpi_comm_rank(comm);
  comm_size = smpi_comm_size(comm);

  extent = smpi_datatype_get_extent(datatype);

  tmp_buf = (void *) smpi_get_tmp_sendbuffer(count * extent);
  int is_commutative = smpi_op_is_commute(op);
  mask = 1;
  
  int lroot;
  if (is_commutative) 
        lroot   = root;
  else
        lroot   = 0;
  relrank = (rank - lroot + comm_size) % comm_size;

  smpi_datatype_extent(datatype, &true_lb, &true_extent);

  /* adjust for potential negative lower bound in datatype */
  tmp_buf = (void *)((char*)tmp_buf - true_lb);
    
  /* If I'm not the root, then my recvbuf may not be valid, therefore
     I have to allocate a temporary one */
  if (rank != root) {
      recvbuf = (void *) smpi_get_tmp_recvbuffer(count*(max(extent,true_extent)));
      recvbuf = (void *)((char*)recvbuf - true_lb);
  }
   if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
      smpi_datatype_copy(sendbuf, count, datatype, recvbuf,count, datatype);
  }

  while (mask < comm_size) {
    /* Receive */
    if ((mask & relrank) == 0) {
      source = (relrank | mask);
      if (source < comm_size) {
        source = (source + lroot) % comm_size;
        smpi_mpi_recv(tmp_buf, count, datatype, source, tag, comm, &status);
        
        if (is_commutative) {
          smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
        } else {
          smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
          smpi_datatype_copy(tmp_buf, count, datatype,recvbuf, count, datatype);
        }
      }
    } else {
      dst = ((relrank & (~mask)) + lroot) % comm_size;
      smpi_mpi_send(recvbuf, count, datatype, dst, tag, comm);
      break;
    }
    mask <<= 1;
  }

  if (!is_commutative && (root != 0)){
    if (rank == 0){
      smpi_mpi_send(recvbuf, count, datatype, root,tag, comm);
    }else if (rank == root){
      smpi_mpi_recv(recvbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  if (rank != root) {
	  smpi_free_tmp_buffer(recvbuf);
  }
  smpi_free_tmp_buffer(tmp_buf);

  return 0;
}
Exemple #18
0
/*****************************************************************************
 * Function: allgather_bruck
 * return: int
 * inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to received
 *   recv_type: data type of elements being received
 *   comm: communication
 * Descrp: Function realizes the allgather operation using the bruck
 *         algorithm.
 * Auther: MPICH
 * Comment: Original bruck algorithm from MPICH is slightly modified by
 *          Ahmad Faraj.  
 ****************************************************************************/
int smpi_coll_tuned_allgather_bruck(void *send_buff, int send_count,
                                    MPI_Datatype send_type, void *recv_buff,
                                    int recv_count, MPI_Datatype recv_type,
                                    MPI_Comm comm)
{
  // MPI variables
  MPI_Status status;
  MPI_Aint recv_extent;

  // local int variables
  int src, dst, rank, num_procs, count, remainder;
  int tag = COLL_TAG_ALLGATHER;
  int pof2 = 1;

  // local string variables
  char *tmp_buff;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  // get size of the communicator, followed by rank 
  num_procs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  // get size of single element's type for recv buffer
  recv_extent = smpi_datatype_get_extent(recv_type);

  count = recv_count;

  tmp_buff = (char *) xbt_malloc(num_procs * recv_count * recv_extent);

  // perform a local copy
  smpi_datatype_copy(send_ptr, send_count, send_type,
		     tmp_buff, recv_count, recv_type);
  while (pof2 <= (num_procs / 2)) {
    src = (rank + pof2) % num_procs;
    dst = (rank - pof2 + num_procs) % num_procs;

    smpi_mpi_sendrecv(tmp_buff, count, recv_type, dst, tag,
                  tmp_buff + count * recv_extent, count, recv_type,
                  src, tag, comm, &status);
    count *= 2;
    pof2 *= 2;
  }

  remainder = num_procs - pof2;
  if (remainder) {
    src = (rank + pof2) % num_procs;
    dst = (rank - pof2 + num_procs) % num_procs;

    smpi_mpi_sendrecv(tmp_buff, remainder * recv_count, recv_type, dst, tag,
                  tmp_buff + count * recv_extent, remainder * recv_count,
                  recv_type, src, tag, comm, &status);
  }

  smpi_mpi_sendrecv(tmp_buff, (num_procs - rank) * recv_count, recv_type, rank,
                tag, recv_ptr + rank * recv_count * recv_extent,
                (num_procs - rank) * recv_count, recv_type, rank, tag, comm,
                &status);

  if (rank)
    smpi_mpi_sendrecv(tmp_buff + (num_procs - rank) * recv_count * recv_extent,
                  rank * recv_count, recv_type, rank, tag, recv_ptr,
                  rank * recv_count, recv_type, rank, tag, comm, &status);
  free(tmp_buff);
  return MPI_SUCCESS;
}
Exemple #19
0
/*
 *  reduce_scatter_ompi_basic_recursivehalving
 *
 *  Function:   - reduce scatter implementation using recursive-halving 
 *                algorithm
 *  Accepts:    - same as MPI_Reduce_scatter()
 *  Returns:    - MPI_SUCCESS or error code
 *  Limitation: - Works only for commutative operations.
 */
int
smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(void *sbuf, 
                                                            void *rbuf, 
                                                            int *rcounts,
                                                            MPI_Datatype dtype,
                                                            MPI_Op op,
                                                            MPI_Comm comm
                                                            )
{
    int i, rank, size, count, err = MPI_SUCCESS;
    int tmp_size=1, remain = 0, tmp_rank, *disps = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;
   
    /* Initialize */
    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
   
    XBT_DEBUG("coll:tuned:reduce_scatter_ompi_basic_recursivehalving, rank %d", rank);

    /* Find displacements and the like */
    disps = (int*) xbt_malloc(sizeof(int) * size);
    if (NULL == disps) return MPI_ERR_OTHER;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
    }
    count = disps[size - 1] + rcounts[size - 1];

    /* short cut the trivial case */
    if (0 == count) {
        xbt_free(disps);
        return MPI_SUCCESS;
    }

    /* get datatype information */
    smpi_datatype_extent(dtype, &lb, &extent);
    smpi_datatype_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (ptrdiff_t)(count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* Allocate temporary receive buffer. */
    recv_buf_free = (char*) xbt_malloc(buf_size);
    recv_buf = recv_buf_free - lb;
    if (NULL == recv_buf_free) {
        err = MPI_ERR_OTHER;
        goto cleanup;
    }
   
    /* allocate temporary buffer for results */
    result_buf_free = (char*) xbt_malloc(buf_size);
    result_buf = result_buf_free - lb;
   
    /* copy local buffer into the temporary results */
    err =smpi_datatype_copy(sbuf, count, dtype, result_buf, count, dtype);
    if (MPI_SUCCESS != err) goto cleanup;
   
    /* figure out power of two mapping: grow until larger than
       comm size, then go back one, to get the largest power of
       two less than comm size */
    while (tmp_size <= size) tmp_size <<= 1;
    tmp_size >>= 1;
    remain = size - tmp_size;
   
    /* If comm size is not a power of two, have the first "remain"
       procs with an even rank send to rank + 1, leaving a power of
       two procs to do the rest of the algorithm */
    if (rank < 2 * remain) {
        if ((rank & 1) == 0) {
            smpi_mpi_send(result_buf, count, dtype, rank + 1, 
                                    COLL_TAG_REDUCE_SCATTER,
                                    comm);
            /* we don't participate from here on out */
            tmp_rank = -1;
        } else {
            smpi_mpi_recv(recv_buf, count, dtype, rank - 1,
                                    COLL_TAG_REDUCE_SCATTER,
                                    comm, MPI_STATUS_IGNORE);
         
            /* integrate their results into our temp results */
            smpi_op_apply(op, recv_buf, result_buf, &count, &dtype);
         
            /* adjust rank to be the bottom "remain" ranks */
            tmp_rank = rank / 2;
        }
    } else {
        /* just need to adjust rank to show that the bottom "even
           remain" ranks dropped out */
        tmp_rank = rank - remain;
    }
   
    /* For ranks not kicked out by the above code, perform the
       recursive halving */
    if (tmp_rank >= 0) {
        int *tmp_disps = NULL, *tmp_rcounts = NULL;
        int mask, send_index, recv_index, last_index;
      
        /* recalculate disps and rcounts to account for the
           special "remainder" processes that are no longer doing
           anything */
        tmp_rcounts = (int*) xbt_malloc(tmp_size * sizeof(int));
        if (NULL == tmp_rcounts) {
            err = MPI_ERR_OTHER;
            goto cleanup;
        }
        tmp_disps = (int*) xbt_malloc(tmp_size * sizeof(int));
        if (NULL == tmp_disps) {
            xbt_free(tmp_rcounts);
            err = MPI_ERR_OTHER;
            goto cleanup;
        }

        for (i = 0 ; i < tmp_size ; ++i) {
            if (i < remain) {
                /* need to include old neighbor as well */
                tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
            } else {
                tmp_rcounts[i] = rcounts[i + remain];
            }
        }

        tmp_disps[0] = 0;
        for (i = 0; i < tmp_size - 1; ++i) {
            tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
        }

        /* do the recursive halving communication.  Don't use the
           dimension information on the communicator because I
           think the information is invalidated by our "shrinking"
           of the communicator */
        mask = tmp_size >> 1;
        send_index = recv_index = 0;
        last_index = tmp_size;
        while (mask > 0) {
            int tmp_peer, peer, send_count, recv_count;
            MPI_Request request;

            tmp_peer = tmp_rank ^ mask;
            peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

            /* figure out if we're sending, receiving, or both */
            send_count = recv_count = 0;
            if (tmp_rank < tmp_peer) {
                send_index = recv_index + mask;
                for (i = send_index ; i < last_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < send_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            } else {
                recv_index = send_index + mask;
                for (i = send_index ; i < recv_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < last_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            }

            /* actual data transfer.  Send from result_buf,
               receive into recv_buf */
            if (send_count > 0 && recv_count != 0) {
                request=smpi_mpi_irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                                         recv_count, dtype, peer,
                                         COLL_TAG_REDUCE_SCATTER,
                                         comm);
                if (MPI_SUCCESS != err) {
                    xbt_free(tmp_rcounts);
                    xbt_free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (recv_count > 0 && send_count != 0) {
                smpi_mpi_send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
                                        send_count, dtype, peer, 
                                        COLL_TAG_REDUCE_SCATTER,
                                        comm);
                if (MPI_SUCCESS != err) {
                    xbt_free(tmp_rcounts);
                    xbt_free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (send_count > 0 && recv_count != 0) {
                smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
            }

            /* if we received something on this step, push it into
               the results buffer */
            if (recv_count > 0) {
                smpi_op_apply(op, 
                               recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, 
                               result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                               &recv_count, &dtype);
            }

            /* update for next iteration */
            send_index = recv_index;
            last_index = recv_index + mask;
            mask >>= 1;
        }

        /* copy local results from results buffer into real receive buffer */
        if (0 != rcounts[rank]) {
            err = smpi_datatype_copy(result_buf + disps[rank] * extent,
                                       rcounts[rank], dtype, 
                                       rbuf, rcounts[rank], dtype);
            if (MPI_SUCCESS != err) {
                xbt_free(tmp_rcounts);
                xbt_free(tmp_disps);
                goto cleanup;
            }                                             
        }

        xbt_free(tmp_rcounts);
        xbt_free(tmp_disps);
    }
 int smpi_coll_tuned_allreduce_mvapich2_rs(void *sendbuf,
                             void *recvbuf,
                             int count,
                             MPI_Datatype datatype,
                             MPI_Op op, MPI_Comm comm)
{
    int comm_size, rank;
    int mpi_errno = MPI_SUCCESS;
    int mask, dst, is_commutative, pof2, newrank = 0, rem, newdst, i,
        send_idx, recv_idx, last_idx, send_cnt, recv_cnt, *cnts, *disps;
    MPI_Aint true_lb, true_extent, extent;
    void *tmp_buf, *tmp_buf_free;

    if (count == 0) {
        return MPI_SUCCESS;
    }

    /* homogeneous */

    comm_size =  smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    is_commutative = smpi_op_is_commute(op);

    /* need to allocate temporary buffer to store incoming data */
    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    tmp_buf_free= smpi_get_tmp_recvbuffer(count * (MAX(extent, true_extent)));

    /* adjust for potential negative lower bound in datatype */
    tmp_buf = (void *) ((char *) tmp_buf_free - true_lb);

    /* copy local data into recvbuf */
    if (sendbuf != MPI_IN_PLACE) {
        mpi_errno =
            smpi_datatype_copy(sendbuf, count, datatype, recvbuf, count,
                           datatype);
    }

    /* find nearest power-of-two less than or equal to comm_size */
    for( pof2 = 1; pof2 <= comm_size; pof2 <<= 1 );
    pof2 >>=1;

    rem = comm_size - pof2;

    /* In the non-power-of-two case, all even-numbered
       processes of rank < 2*rem send their data to
       (rank+1). These even-numbered processes no longer
       participate in the algorithm until the very end. The
       remaining processes form a nice power-of-two. */

    if (rank < 2 * rem) {
        if (rank % 2 == 0) {
            /* even */
            smpi_mpi_send(recvbuf, count, datatype, rank + 1,
                                     COLL_TAG_ALLREDUCE, comm);

            /* temporarily set the rank to -1 so that this
               process does not pariticipate in recursive
               doubling */
            newrank = -1;
        } else {
            /* odd */
            smpi_mpi_recv(tmp_buf, count, datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm,
                                     MPI_STATUS_IGNORE);
            /* do the reduction on received data. since the
               ordering is right, it doesn't matter whether
               the operation is commutative or not. */
               smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                /* change the rank */
                newrank = rank / 2;
        }
    } else {                /* rank >= 2*rem */
        newrank = rank - rem;
    }

    /* If op is user-defined or count is less than pof2, use
       recursive doubling algorithm. Otherwise do a reduce-scatter
       followed by allgather. (If op is user-defined,
       derived datatypes are allowed and the user could pass basic
       datatypes on one process and derived on another as long as
       the type maps are the same. Breaking up derived
       datatypes to do the reduce-scatter is tricky, therefore
       using recursive doubling in that case.) */

    if (newrank != -1) {
        if (/*(HANDLE_GET_KIND(op) != HANDLE_KIND_BUILTIN) ||*/ (count < pof2)) {  /* use recursive doubling */
            mask = 0x1;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                /* Send the most current data, which is in recvbuf. Recv
                   into tmp_buf */
                smpi_mpi_sendrecv(recvbuf, count, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             tmp_buf, count, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                if (is_commutative || (dst < rank)) {
                    /* op is commutative OR the order is already right */
                     smpi_op_apply(op, tmp_buf, recvbuf, &count, &datatype);
                } else {
                    /* op is noncommutative and the order is not right */
                    smpi_op_apply(op, recvbuf, tmp_buf, &count, &datatype);
                    /* copy result back into recvbuf */
                    mpi_errno = smpi_datatype_copy(tmp_buf, count, datatype,
                                               recvbuf, count, datatype);
                }
                mask <<= 1;
            }
        } else {

            /* do a reduce-scatter followed by allgather */

            /* for the reduce-scatter, calculate the count that
               each process receives and the displacement within
               the buffer */
            cnts = (int *)xbt_malloc(pof2 * sizeof (int));
            disps = (int *)xbt_malloc(pof2 * sizeof (int));

            for (i = 0; i < (pof2 - 1); i++) {
                cnts[i] = count / pof2;
            }
            cnts[pof2 - 1] = count - (count / pof2) * (pof2 - 1);

            disps[0] = 0;
            for (i = 1; i < pof2; i++) {
                disps[i] = disps[i - 1] + cnts[i - 1];
            }

            mask = 0x1;
            send_idx = recv_idx = 0;
            last_idx = pof2;
            while (mask < pof2) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    send_idx = recv_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < send_idx; i++)
                        recv_cnt += cnts[i];
                } else {
                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++)
                        send_cnt += cnts[i];
                    for (i = recv_idx; i < last_idx; i++)
                        recv_cnt += cnts[i];
                }

                /* Send data from recvbuf. Recv into tmp_buf */
                smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) tmp_buf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);

                /* tmp_buf contains data received in this step.
                   recvbuf contains data accumulated so far */

                /* This algorithm is used only for predefined ops
                   and predefined ops are always commutative. */

                smpi_op_apply(op, (char *) tmp_buf + disps[recv_idx] * extent,
                        (char *) recvbuf + disps[recv_idx] * extent,
                        &recv_cnt, &datatype);

                /* update send_idx for next iteration */
                send_idx = recv_idx;
                mask <<= 1;

                /* update last_idx, but not in last iteration
                   because the value is needed in the allgather
                   step below. */
                if (mask < pof2)
                    last_idx = recv_idx + pof2 / mask;
            }

            /* now do the allgather */

            mask >>= 1;
            while (mask > 0) {
                newdst = newrank ^ mask;
                /* find real rank of dest */
                dst = (newdst < rem) ? newdst * 2 + 1 : newdst + rem;

                send_cnt = recv_cnt = 0;
                if (newrank < newdst) {
                    /* update last_idx except on first iteration */
                    if (mask != pof2 / 2) {
                        last_idx = last_idx + pof2 / (mask * 2);
                    }

                    recv_idx = send_idx + pof2 / (mask * 2);
                    for (i = send_idx; i < recv_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < last_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                } else {
                    recv_idx = send_idx - pof2 / (mask * 2);
                    for (i = send_idx; i < last_idx; i++) {
                        send_cnt += cnts[i];
                    }
                    for (i = recv_idx; i < send_idx; i++) {
                        recv_cnt += cnts[i];
                    }
                }

               smpi_mpi_sendrecv((char *) recvbuf +
                                             disps[send_idx] * extent,
                                             send_cnt, datatype,
                                             dst, COLL_TAG_ALLREDUCE,
                                             (char *) recvbuf +
                                             disps[recv_idx] * extent,
                                             recv_cnt, datatype, dst,
                                             COLL_TAG_ALLREDUCE, comm,
                                             MPI_STATUS_IGNORE);
                if (newrank > newdst) {
                    send_idx = recv_idx;
                }

                mask >>= 1;
            }
        }
    }

    /* In the non-power-of-two case, all odd-numbered
       processes of rank < 2*rem send the result to
       (rank-1), the ranks who didn't participate above. */
    if (rank < 2 * rem) {
        if (rank % 2) {     /* odd */
            smpi_mpi_send(recvbuf, count,
                                     datatype, rank - 1,
                                     COLL_TAG_ALLREDUCE, comm);
        } else {            /* even */
            smpi_mpi_recv(recvbuf, count,
                                  datatype, rank + 1,
                                  COLL_TAG_ALLREDUCE, comm,
                                  MPI_STATUS_IGNORE);
        }
    }
    smpi_free_tmp_buffer(tmp_buf_free);
    return (mpi_errno);

}
Exemple #21
0
/**
 * Alltoall Bruck
 *
 * Openmpi calls this routine when the message size sent to each rank < 2000 bytes and size < 12
 * FIXME: uh, check smpi_pmpi again, but this routine is called for > 12, not
 * less...
 **/
int smpi_coll_tuned_alltoallv_bruck(void *sendbuf, int *sendcounts, int *senddisps,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int *recvcounts, int *recvdisps, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int system_tag = 777;
  int i, rank, size, err, count;
  MPI_Aint lb;
  MPI_Aint sendext = 0;
  MPI_Aint recvext = 0;
  MPI_Request *requests;

  // FIXME: check implementation
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank);

  err = smpi_datatype_extent(sendtype, &lb, &sendext);
  err = smpi_datatype_extent(recvtype, &lb, &recvext);
  /* Local copy from self */
  err =
      smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext, 
                         sendcounts[rank], sendtype, 
                         (char *)recvbuf + recvdisps[rank] * recvext,
                         recvcounts[rank], recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */
    requests = xbt_new(MPI_Request, 2 * (size - 1));
    count = 0;
    /* Create all receives that will be posted first */
    for (i = 0; i < size; ++i) {
      if (i == rank) {
        XBT_DEBUG("<%d> skip request creation [src = %d, recvcount = %d]",
               rank, i, recvcounts[i]);
        continue;
      }
      requests[count] =
          smpi_irecv_init((char *)recvbuf + recvdisps[i] * recvext, recvcounts[i],
                          recvtype, i, system_tag, comm);
      count++;
    }
    /* Now create all sends  */
    for (i = 0; i < size; ++i) {
      if (i == rank) {
        XBT_DEBUG("<%d> skip request creation [dst = %d, sendcount = %d]",
               rank, i, sendcounts[i]);
        continue;
      }
      requests[count] =
          smpi_isend_init((char *)sendbuf + senddisps[i] * sendext, sendcounts[i],
                          sendtype, i, system_tag, comm);
      count++;
    }
    /* Wait for them all. */
    smpi_mpi_startall(count, requests);
    XBT_DEBUG("<%d> wait for %d requests", rank, count);
    smpi_mpi_waitall(count, requests, MPI_STATUS_IGNORE);
    xbt_free(requests);
  }
  return MPI_SUCCESS;
}
int 
smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                  MPI_Datatype sdtype,
                                                  void* rbuf, int *rcounts, int *rdispls,
                                                  MPI_Datatype rdtype,
                                                  MPI_Comm comm)
{
    int line = -1;
    int rank, size;
    int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  
    int i, even_rank;
    int err = 0;
    ptrdiff_t slb, rlb, sext, rext;
    char *tmpsend = NULL, *tmprecv = NULL;


    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    if (size % 2) {
        XBT_DEBUG(
                     "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
                     size);
        return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                                     rbuf, rcounts, 
                                                     rdispls, rdtype,
                                                     comm);
    }

    XBT_DEBUG(
                 "coll:tuned:allgatherv_ompi_neighborexchange rank %d", rank);

    err = smpi_datatype_extent (sdtype, &slb, &sext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    err = smpi_datatype_extent (rdtype, &rlb, &rext);
    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

    /* Initialization step:
       - if send buffer is not MPI_IN_PLACE, copy send buffer to 
       the appropriate block of receive buffer
    */
    tmprecv = (char*) rbuf + rdispls[rank] * rext;
    if (MPI_IN_PLACE != sbuf) {
        tmpsend = (char*) sbuf;
        err = smpi_datatype_copy(tmpsend, scount, sdtype, 
                              tmprecv, rcounts[rank], rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl;  }
    } 

    /* Determine neighbors, order in which blocks will arrive, etc. */
    even_rank = !(rank % 2);
    if (even_rank) {
        neighbor[0] = (rank + 1) % size;
        neighbor[1] = (rank - 1 + size) % size;
        recv_data_from[0] = rank;
        recv_data_from[1] = rank;
        offset_at_step[0] = (+2);
        offset_at_step[1] = (-2);
    } else {
        neighbor[0] = (rank - 1 + size) % size;
        neighbor[1] = (rank + 1) % size;
        recv_data_from[0] = neighbor[0];
        recv_data_from[1] = neighbor[0];
        offset_at_step[0] = (-2);
        offset_at_step[1] = (+2);
    }

    /* Communication loop:
       - First step is special: exchange a single block with neighbor[0].
       - Rest of the steps: 
       update recv_data_from according to offset, and 
       exchange two blocks with appropriate neighbor.
       the send location becomes previous receve location.
       Note, we need to create indexed datatype to send and receive these
       blocks properly.
    */
    tmprecv = (char*)rbuf + rdispls[neighbor[0]] * rext;
    tmpsend = (char*)rbuf + rdispls[rank] * rext;
    smpi_mpi_sendrecv(tmpsend, rcounts[rank], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   tmprecv, rcounts[neighbor[0]], rdtype, 
                                   neighbor[0], COLL_TAG_ALLGATHERV,
                                   comm, MPI_STATUS_IGNORE);



  
   
    /* Determine initial sending counts and displacements*/
    if (even_rank) {
        send_data_from = rank;
    } else {
        send_data_from = recv_data_from[0];
    }

    for (i = 1; i < (size / 2); i++) {
        MPI_Datatype new_rdtype, new_sdtype;
        int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
        const int i_parity = i % 2;
        recv_data_from[i_parity] = 
            (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

        /* Create new indexed types for sending and receiving.
           We are sending data from ranks (send_data_from) and (send_data_from+1)
           We are receiving data from ranks (recv_data_from[i_parity]) and
           (recv_data_from[i_parity]+1).
        */
        
        new_scounts[0] = rcounts[send_data_from];
        new_scounts[1] = rcounts[(send_data_from + 1)];
        new_sdispls[0] = rdispls[send_data_from];
        new_sdispls[1] = rdispls[(send_data_from + 1)];
        err = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, 
                                      &new_sdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_sdtype);

        new_rcounts[0] = rcounts[recv_data_from[i_parity]];
        new_rcounts[1] = rcounts[(recv_data_from[i_parity] + 1)];
        new_rdispls[0] = rdispls[recv_data_from[i_parity]];
        new_rdispls[1] = rdispls[(recv_data_from[i_parity] + 1)];
        err = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, 
                                      &new_rdtype);
        if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
        smpi_datatype_commit(&new_rdtype);
      
        tmprecv = (char*)rbuf;
        tmpsend = (char*)rbuf;
      
        /* Sendreceive */
        smpi_mpi_sendrecv(tmpsend, 1, new_sdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       tmprecv, 1, new_rdtype, neighbor[i_parity],
                                       COLL_TAG_ALLGATHERV,
                                       comm, MPI_STATUS_IGNORE);

        send_data_from = recv_data_from[i_parity];
      
        smpi_datatype_free(&new_sdtype);
        smpi_datatype_free(&new_rdtype);
    }

    return MPI_SUCCESS;

 err_hndl:
    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
                 __FILE__, line, err, rank);
    return err;
}
/*  
 * Linear functions are copied from the basic coll module.  For
 * some small number of nodes and/or small data sizes they are just as
 * fast as tuned/tree based segmenting operations and as such may be
 * selected by the decision functions.  These are copied into this module
 * due to the way we select modules in V1. i.e. in V2 we will handle this
 * differently and so will not have to duplicate code.  
 * GEF Oct05 after asking Jeff.  
 */
int
smpi_coll_tuned_alltoallv_ompi_basic_linear(void *sbuf, int *scounts, int *sdisps,
                                            MPI_Datatype sdtype,
                                            void *rbuf, int *rcounts, int *rdisps,
                                            MPI_Datatype rdtype,
                                            MPI_Comm comm)
{
    int i, size, rank;
    char *psnd, *prcv;
    int nreqs;
    ptrdiff_t sext, rext;
    MPI_Request *preq;
    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    MPI_Request *ireqs= xbt_malloc(sizeof(MPI_Request) * size * 2);
    XBT_DEBUG(
                 "coll:tuned:alltoallv_intra_basic_linear rank %d", rank);

    sext=smpi_datatype_get_extent(sdtype);
    rext=smpi_datatype_get_extent(rdtype);

    /* Simple optimization - handle send to self first */
    psnd = ((char *) sbuf) + (sdisps[rank] * sext);
    prcv = ((char *) rbuf) + (rdisps[rank] * rext);
    if (0 != scounts[rank]) {
        smpi_datatype_copy(psnd, scounts[rank], sdtype,
                              prcv, rcounts[rank], rdtype);
    }

    /* If only one process, we're done. */
    if (1 == size) {
        return MPI_SUCCESS;
    }

    /* Now, initiate all send/recv to/from others. */
    nreqs = 0;
    preq = ireqs;

    /* Post all receives first */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == rcounts[i]) {
            continue;
        }

        prcv = ((char *) rbuf) + (rdisps[i] * rext);

        *preq = smpi_irecv_init(prcv, rcounts[i], rdtype,
                                      i, COLL_TAG_ALLTOALLV, comm
                                      );
        preq++;
        ++nreqs;
        
    }

    /* Now post all sends */
    for (i = 0; i < size; ++i) {
        if (i == rank || 0 == scounts[i]) {
            continue;
        }

        psnd = ((char *) sbuf) + (sdisps[i] * sext);
        *preq=smpi_isend_init(psnd, scounts[i], sdtype,
                                      i, COLL_TAG_ALLTOALLV, comm
                                      );
        preq++;
        ++nreqs;
    }

    /* Start your engines.  This will never return an error. */
    smpi_mpi_startall(nreqs, ireqs);

    /* Wait for them all.  If there's an error, note that we don't care
     * what the error was -- just that there *was* an error.  The PML
     * will finish all requests, even if one or more of them fail.
     * i.e., by the end of this call, all the requests are free-able.
     * So free them anyway -- even if there was an error, and return the
     * error after we free everything. */
    smpi_mpi_waitall(nreqs, ireqs,
                                MPI_STATUSES_IGNORE);

    /* Free the requests. */
    for (i = 0; i < nreqs; ++i) {
      if(ireqs[i]!=MPI_REQUEST_NULL)smpi_mpi_request_free(&ireqs[i]);
    }

    return MPI_SUCCESS;
}
int 
smpi_coll_tuned_allgather_ompi_neighborexchange(void *sbuf, int scount,
                                                 MPI_Datatype sdtype,
                                                 void* rbuf, int rcount,
                                                 MPI_Datatype rdtype,
                                                 MPI_Comm comm
)
{
   int line = -1;
   int rank, size;
   int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
   int i, even_rank;
   int err = 0;
   ptrdiff_t slb, rlb, sext, rext;
   char *tmpsend = NULL, *tmprecv = NULL;

   size = smpi_comm_size(comm);
   rank = smpi_comm_rank(comm);

   if (size % 2) {
      XBT_DEBUG(
                   "coll:tuned:allgather_intra_neighborexchange WARNING: odd size %d, switching to ring algorithm", 
                   size);
      return smpi_coll_tuned_allgather_ring(sbuf, scount, sdtype,
                                                  rbuf, rcount, rdtype,
                                                  comm);
   }

   XBT_DEBUG(
                "coll:tuned:allgather_intra_neighborexchange rank %d", rank);

   err = smpi_datatype_extent (sdtype, &slb, &sext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   err = smpi_datatype_extent (rdtype, &rlb, &rext);
   if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

   /* Initialization step:
      - if send buffer is not MPI_IN_PLACE, copy send buffer to appropriate block
        of receive buffer
   */
   tmprecv = (char*) rbuf + rank * rcount * rext;
   if (MPI_IN_PLACE != sbuf) {
      tmpsend = (char*) sbuf;
      smpi_datatype_copy (tmpsend, scount, sdtype, tmprecv, rcount, rdtype);
   } 

   /* Determine neighbors, order in which blocks will arrive, etc. */
   even_rank = !(rank % 2);
   if (even_rank) {
      neighbor[0] = (rank + 1) % size;
      neighbor[1] = (rank - 1 + size) % size;
      recv_data_from[0] = rank;
      recv_data_from[1] = rank;
      offset_at_step[0] = (+2);
      offset_at_step[1] = (-2);
   } else {
      neighbor[0] = (rank - 1 + size) % size;
      neighbor[1] = (rank + 1) % size;
      recv_data_from[0] = neighbor[0];
      recv_data_from[1] = neighbor[0];
      offset_at_step[0] = (-2);
      offset_at_step[1] = (+2);
   }

   /* Communication loop:
      - First step is special: exchange a single block with neighbor[0].
      - Rest of the steps: 
        update recv_data_from according to offset, and 
        exchange two blocks with appropriate neighbor.
        the send location becomes previous receve location.
   */
   tmprecv = (char*)rbuf + neighbor[0] * rcount * rext;
   tmpsend = (char*)rbuf + rank * rcount * rext;
   /* Sendreceive */
   smpi_mpi_sendrecv(tmpsend, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  tmprecv, rcount, rdtype, neighbor[0],
                                  COLL_TAG_ALLGATHER,
                                  comm, MPI_STATUS_IGNORE);

   /* Determine initial sending location */
   if (even_rank) {
      send_data_from = rank;
   } else {
      send_data_from = recv_data_from[0];
   }

   for (i = 1; i < (size / 2); i++) {
      const int i_parity = i % 2;
      recv_data_from[i_parity] = 
         (recv_data_from[i_parity] + offset_at_step[i_parity] + size) % size;

      tmprecv = (char*)rbuf + recv_data_from[i_parity] * rcount * rext;
      tmpsend = (char*)rbuf + send_data_from * rcount * rext;
      
      /* Sendreceive */
      smpi_mpi_sendrecv(tmpsend, 2 * rcount, rdtype, 
                                     neighbor[i_parity], 
                                     COLL_TAG_ALLGATHER,
                                     tmprecv, 2 * rcount, rdtype,
                                     neighbor[i_parity],
                                     COLL_TAG_ALLGATHER,
                                     comm, MPI_STATUS_IGNORE);

      send_data_from = recv_data_from[i_parity];
   }

   return MPI_SUCCESS;

 err_hndl:
   XBT_DEBUG( "%s:%4d\tError occurred %d, rank %2d",
                __FILE__, line, err, rank);
   return err;
}
Exemple #25
0
int
smpi_coll_tuned_scatter_ompi_binomial(void *sbuf, int scount,
				       MPI_Datatype sdtype,
				       void *rbuf, int rcount,
				       MPI_Datatype rdtype,
				       int root,
				       MPI_Comm comm
				       )
{
    int line = -1;
    int i;
    int rank;
    int vrank;
    int size;
    int total_send = 0;
    char *ptmp     = NULL;
    char *tempbuf  = NULL;
    int err;
    ompi_coll_tree_t* bmtree;
    MPI_Status status;
    MPI_Aint sextent, slb, strue_lb, strue_extent; 
    MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent;

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);

    XBT_DEBUG(
                 "smpi_coll_tuned_scatter_ompi_binomial rank %d", rank);

    /* create the binomial tree */
    
//    COLL_TUNED_UPDATE_IN_ORDER_BMTREE( comm, tuned_module, root );
    bmtree =  ompi_coll_tuned_topo_build_in_order_bmtree( comm, root);//ompi_ data->cached_in_order_bmtree;

    smpi_datatype_extent(sdtype, &slb, &sextent);
    smpi_datatype_extent(sdtype, &strue_lb, &strue_extent);
    smpi_datatype_extent(rdtype, &rlb, &rextent);
    smpi_datatype_extent(rdtype, &rtrue_lb, &rtrue_extent);

    vrank = (rank - root + size) % size;

    if (rank == root) {
	if (0 == root) {
	    /* root on 0, just use the send buffer */
	    ptmp = (char *) sbuf;
	    if (rbuf != MPI_IN_PLACE) {
		/* local copy to rbuf */
		err = smpi_datatype_copy(sbuf, scount, sdtype,
				      rbuf, rcount, rdtype);
		if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
	    }
	} else {
	    /* root is not on 0, allocate temp buffer for send */
	    tempbuf = (char *) malloc(strue_extent + (scount*size - 1) * sextent);
	    if (NULL == tempbuf) {
		err = MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
	    }

	    ptmp = tempbuf - slb;

	    /* and rotate data so they will eventually in the right place */
	    err = smpi_datatype_copy((char *) sbuf + sextent*root*scount, scount*(size-root), sdtype,
            ptmp, scount*(size-root), sdtype);
	    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }


	    err = smpi_datatype_copy((char*)sbuf, scount*root, sdtype,
						 ptmp + sextent*scount*(size - root), scount*root, sdtype);
	    if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }

	    if (rbuf != MPI_IN_PLACE) {
		/* local copy to rbuf */
		err = smpi_datatype_copy(ptmp, scount, sdtype,
				      rbuf, rcount, rdtype);
		if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; }
	    }
	}
	total_send = scount;
    } else if (!(vrank % 2)) {
	/* non-root, non-leaf nodes, allocte temp buffer for recv
	 * the most we need is rcount*size/2 */
	tempbuf = (char *) malloc(rtrue_extent + (rcount*size - 1) * rextent);
	if (NULL == tempbuf) {
	    err= MPI_ERR_OTHER; line = __LINE__; goto err_hndl;
	}

	ptmp = tempbuf - rlb;

	sdtype = rdtype;
	scount = rcount;
	sextent = rextent;
	total_send = scount;
    } else {
	/* leaf nodes, just use rbuf */
	ptmp = (char *) rbuf;
    }

    if (!(vrank % 2)) {
	if (rank != root) {
	    /* recv from parent on non-root */
	    smpi_mpi_recv(ptmp, rcount*size, rdtype, bmtree->tree_prev,
				    COLL_TAG_SCATTER, comm, &status);
	    /* local copy to rbuf */
	    err = smpi_datatype_copy(ptmp, scount, sdtype,
				  rbuf, rcount, rdtype);
	}
	/* send to children on all non-leaf */
	for (i = 0; i < bmtree->tree_nextsize; i++) {
	    int mycount = 0, vkid;
	    /* figure out how much data I have to send to this child */
	    vkid = (bmtree->tree_next[i] - root + size) % size;
	    mycount = vkid - vrank;
	    if (mycount > (size - vkid))
		mycount = size - vkid;
	    mycount *= scount;

	    smpi_mpi_send(ptmp + total_send*sextent, mycount, sdtype,
				    bmtree->tree_next[i],
				    COLL_TAG_SCATTER,
				     comm);

	    total_send += mycount;
	}

	if (NULL != tempbuf) 
	    free(tempbuf);
    } else {
	/* recv from parent on leaf nodes */
	smpi_mpi_recv(ptmp, rcount, rdtype, bmtree->tree_prev,
				COLL_TAG_SCATTER, comm, &status);
    }
    //!FIXME : store the tree, as done in ompi, instead of calculating it each time ?
    xbt_free(bmtree);

    return MPI_SUCCESS;

 err_hndl:
    if (NULL != tempbuf)
	free(tempbuf);

    XBT_DEBUG(  "%s:%4d\tError occurred %d, rank %2d",
		 __FILE__, line, err, rank);
    return err;
}