Example #1
static void action_Irecv(const char *const *action)
{
  CHECK_ACTION_PARAMS(action, 2, 1);
  int from = atoi(action[2]);
  double size=parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;

  if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]);
  else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE;

  int rank = smpi_process_index();
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  instr_extra_data extra = xbt_new0(s_instr_extra_data_t,1);
  extra->type = TRACING_IRECV;
  extra->send_size = size;
  extra->src = src_traced;
  extra->dst = rank;
  extra->datatype1 = encode_datatype(MPI_CURRENT_TYPE, NULL);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__, extra);
  MPI_Status status;
  // unknown size from the receiver's point of view
  if(size==-1){
      smpi_mpi_probe(from, 0, MPI_COMM_WORLD, &status);
      size=status.count;
  }

  request = smpi_mpi_irecv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD);

  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  request->recv = 1;
  xbt_dynar_push(get_reqq_self(),&request);

  log_timed_action (action, clock);
}
// Allgather-Non-Topology-Specific-Logical-Ring algorithm
int
smpi_coll_tuned_allgather_NTSLR_NB(void *sbuf, int scount, MPI_Datatype stype,
                                   void *rbuf, int rcount, MPI_Datatype rtype,
                                   MPI_Comm comm)
{
    MPI_Aint rextent, sextent;
    MPI_Status status, status2;
    int i, to, from, rank, size;
    int send_offset, recv_offset;
    int tag = COLL_TAG_ALLGATHER;

    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
    rextent = smpi_datatype_get_extent(rtype);
    sextent = smpi_datatype_get_extent(stype);
    MPI_Request *rrequest_array;
    MPI_Request *srequest_array;
    rrequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));
    srequest_array = (MPI_Request *) xbt_malloc(size * sizeof(MPI_Request));

    // irregular case: fall back to the default MPI_Allgather
    if (scount * sextent != rcount * rextent) {
        XBT_WARN("MPI_allgather_NTSLR_NB uses default MPI_allgather.");
        smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
        free(rrequest_array);
        free(srequest_array);
        return MPI_SUCCESS;
    }

    // topo non-specific
    to = (rank + 1) % size;
    from = (rank + size - 1) % size;

    //copy a single segment from sbuf to rbuf
    send_offset = rank * scount * sextent;

    smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
                      (char *)rbuf + send_offset, rcount, rtype, rank, tag, comm, &status);


    //start sending logical ring message
    int increment = scount * sextent;

    //post all irecv first
    for (i = 0; i < size - 1; i++) {
        recv_offset = ((rank - i - 1 + size) % size) * increment;
        rrequest_array[i] = smpi_mpi_irecv((char *)rbuf + recv_offset, rcount, rtype, from, tag + i, comm);
    }


    for (i = 0; i < size - 1; i++) {
        send_offset = ((rank - i + size) % size) * increment;
        srequest_array[i] = smpi_mpi_isend((char *)rbuf + send_offset, scount, stype, to, tag + i, comm);
        smpi_mpi_wait(&rrequest_array[i], &status);
        smpi_mpi_wait(&srequest_array[i], &status2);
    }

    free(rrequest_array);
    free(srequest_array);

    return MPI_SUCCESS;
}
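
/*
 * Editor's note: a minimal standalone sketch (not from the SimGrid sources) that
 * illustrates the logical-ring indexing used by smpi_coll_tuned_allgather_NTSLR_NB
 * above: every rank sends to (rank + 1) % size, receives from (rank + size - 1) % size,
 * and at step i handles the block that originated at rank (rank - i - 1 + size) % size.
 * The helper name print_ring_schedule is hypothetical.
 */
#include <stdio.h>

static void print_ring_schedule(int rank, int size)
{
  int to = (rank + 1) % size;          /* right neighbour on the ring */
  int from = (rank + size - 1) % size; /* left neighbour on the ring */
  printf("rank %d: send to %d, recv from %d\n", rank, to, from);
  for (int i = 0; i < size - 1; i++) {
    int recv_block = (rank - i - 1 + size) % size; /* block received at step i */
    int send_block = (rank - i + size) % size;     /* block forwarded at step i */
    printf("  step %d: recv block %d, send block %d\n", i, recv_block, send_block);
  }
}

int main(void)
{
  for (int r = 0; r < 4; r++)   /* print the schedule of a 4-process ring */
    print_ring_schedule(r, 4);
  return 0;
}
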
Example #3
static void action_Irecv(const char *const *action)
{
  int from = atoi(action[2]);
  double size=parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;
  smpi_replay_globals_t globals =
     (smpi_replay_globals_t) smpi_process_get_user_data();

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  request = smpi_mpi_irecv(NULL, size, MPI_BYTE, from, 0, MPI_COMM_WORLD);
#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  request->recv = 1;
#endif
  xbt_dynar_push(globals->irecvs,&request);

  //TODO do the asynchronous cleanup
  if (XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose)){
    char *name = xbt_str_join_array(action, " ");
    XBT_VERB("%s %f", name, smpi_process_simulated_elapsed()-clock);
    free(name);
  }
}
Example #4
static void action_Irecv(const char *const *action)
{
  int from = atoi(action[2]);
  double size=parse_double(action[3]);
  double clock = smpi_process_simulated_elapsed();
  MPI_Request request;

  smpi_replay_globals_t globals =
     (smpi_replay_globals_t) smpi_process_get_user_data();

  if(action[4]) MPI_CURRENT_TYPE=decode_datatype(action[4]);
  else MPI_CURRENT_TYPE= MPI_DEFAULT_TYPE;

#ifdef HAVE_TRACING
  int rank = smpi_comm_rank(MPI_COMM_WORLD);
  int src_traced = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), from);
  TRACE_smpi_ptp_in(rank, src_traced, rank, __FUNCTION__);
#endif

  request = smpi_mpi_irecv(NULL, size, MPI_CURRENT_TYPE, from, 0, MPI_COMM_WORLD);

#ifdef HAVE_TRACING
  TRACE_smpi_ptp_out(rank, src_traced, rank, __FUNCTION__);
  request->recv = 1;
#endif
  xbt_dynar_push(globals->irecvs,&request);
  xbt_dynar_push(reqq[smpi_comm_rank(MPI_COMM_WORLD)],&request);

  log_timed_action (action, clock);
}
/*****************************************************************************
 * Function: allgather_spreading_simple
 * return: int
 *  inputs:
 *   send_buff: send input buffer
 *   send_count: number of elements to send
 *   send_type: data type of elements being sent
 *   recv_buff: receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type: data type of elements being received
 *   comm: communicator
 * Descrp: Let i -> j denote the communication from node i to node j. The
 *         order of communications for node i is i -> i + 1, i -> i + 2, ...,
 *         i -> (i + p -1) % P.
 *
 * Author: Ahmad Faraj
 ****************************************************************************/
int
smpi_coll_tuned_allgather_spreading_simple(void *send_buff, int send_count,
                                           MPI_Datatype send_type,
                                           void *recv_buff, int recv_count,
                                           MPI_Datatype recv_type,
                                           MPI_Comm comm)
{
  MPI_Request *reqs, *req_ptr;
  MPI_Aint extent;
  int i, src, dst, rank, num_procs, num_reqs;
  int tag = COLL_TAG_ALLGATHER;
  MPI_Status status;
  char *recv_ptr = (char *) recv_buff;

  rank = smpi_comm_rank(comm);
  num_procs = smpi_comm_size(comm);
  extent = smpi_datatype_get_extent(send_type);

  num_reqs = (2 * num_procs) - 2;
  reqs = (MPI_Request *) xbt_malloc(num_reqs * sizeof(MPI_Request));
  if (!reqs) {
    printf("allgather-spreading-simple.c:40: cannot allocate memory\n");
    MPI_Finalize();
    exit(0);
  }

  req_ptr = reqs;
  smpi_mpi_sendrecv(send_buff, send_count, send_type, rank, tag,
               (char *) recv_buff + rank * recv_count * extent, recv_count,
               recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++) {
    src = (rank + i) % num_procs;
    if (src == rank)
      continue;
    *(req_ptr++) = smpi_mpi_irecv(recv_ptr + src * recv_count * extent, recv_count, recv_type,
              src, tag, comm);
  }

  for (i = 0; i < num_procs; i++) {
    dst = (rank + i) % num_procs;
    if (dst == rank)
      continue;
    *(req_ptr++) = smpi_mpi_isend(send_buff, send_count, send_type, dst, tag, comm);
  }

  smpi_mpi_waitall(num_reqs, reqs, MPI_STATUSES_IGNORE);
  free(reqs);

  return MPI_SUCCESS;
}
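
/*
 * Editor's note: a minimal standalone sketch (not from the SimGrid sources) of the
 * communication order documented above for allgather_spreading_simple: rank i
 * contacts its peers in the order i+1, i+2, ..., (i + p - 1) % p, skipping itself.
 * The helper name print_spreading_order is hypothetical.
 */
#include <stdio.h>

static void print_spreading_order(int rank, int num_procs)
{
  printf("rank %d communicates with:", rank);
  for (int i = 0; i < num_procs; i++) {
    int peer = (rank + i) % num_procs;
    if (peer == rank)
      continue;                 /* no self-message */
    printf(" %d", peer);
  }
  printf("\n");
}

int main(void)
{
  for (int r = 0; r < 5; r++)   /* order for a 5-process communicator */
    print_spreading_order(r, 5);
  return 0;
}
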
Example #6
int smpi_coll_tuned_barrier_ompi_basic_linear(MPI_Comm comm)
{
    int i;
    int size = smpi_comm_size(comm);
    int rank = smpi_comm_rank(comm);

    /* All non-root send & receive zero-length message. */

    if (rank > 0) {
        smpi_mpi_send (NULL, 0, MPI_BYTE, 0, 
                                 COLL_TAG_BARRIER,
                                  comm);

        smpi_mpi_recv (NULL, 0, MPI_BYTE, 0, 
                                 COLL_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE);
    }

    /* The root collects and broadcasts the messages. */

    else {
        MPI_Request* requests;

        requests = (MPI_Request*)malloc( size * sizeof(MPI_Request) );
        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     COLL_TAG_BARRIER, comm
                                     );
        }
        smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );

        for (i = 1; i < size; ++i) {
            requests[i] = smpi_mpi_isend(NULL, 0, MPI_BYTE, i,
                                     COLL_TAG_BARRIER,
                                      comm
                                     );
        }
        smpi_mpi_waitall( size-1, requests+1, MPI_STATUSES_IGNORE );
        free( requests );
    }

    /* All done */

    return MPI_SUCCESS;

}
/* Non-topology-specific pipelined linear-reduce function */
int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 MPI_Op op, int root,
                                                 MPI_Comm comm)
{
  int rank;
  rank = smpi_comm_rank(comm);

  int tag = -COLL_TAG_REDUCE;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Status temp_status_array[MAX_NODE];

  int size;
  int i;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_received[MAX_NODE];

  int header_buf[HEADER_SIZE];
  char temp_buf[MAX_NODE];

  MPI_Aint extent, lb;
  smpi_datatype_extent(datatype, &lb, &extent);

  /* source and destination */
  int to, from;

  size=smpi_comm_size(comm);
  rank=smpi_comm_rank(comm);


  /* segment is segment size in number of elements (not bytes) */
  int segment = reduce_arrival_pattern_aware_segment_size_in_byte / extent;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size => 
     the small remainder will be done with native implementation */
  int remainder = count % segment;


  /* value == 0 means root has not sent data (or header) to the node yet */
  for (i = 0; i < MAX_NODE; i++) {
    already_received[i] = 0;
  }

  char *tmp_buf;
  tmp_buf = (char *) xbt_malloc(count * extent);

  smpi_mpi_sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype, rank,
               tag, comm, &status);



  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {

    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {

        for (i = 1; i < size; i++) {
          if (already_received[i] == 0) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                             MPI_STATUS_IGNORE);
            simcall_process_sleep(0.0001);
            }
        }

        header_index = 0;
        /* recv 1-byte message */
        for (i = 0; i < size; i++) {
          if (i == rank)
            continue;

          /* 1-byte message arrive */
          if ((flag_array[i] == 1) && (already_received[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;


            //printf("root send to %d recv from %d : data = ",to,from);
            /*
               for (i=0;i<=header_index;i++) {
               printf("%d ",header_buf[i]);
               }
               printf("\n");
             */
            /* will receive in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by receive and reduce data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          from = header_buf[header_index - 1];

          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_recv(tmp_buf, count, datatype, from, tag, comm, &status);
          smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        }
      }                         /* while loop */
    }

    /* root */
    /* non-root */
    else {

      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
               &status);
      //      smpi_mpi_recv(buf,count,datatype,MPI_ANY_SOURCE,tag,comm,&status);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* forward header */
      if (header_buf[myordering + 1] != -1) {
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }
      //printf("node %d ordering %d\n",rank,myordering);

      /* receive, reduce, and forward data */

      /* send only */
      if (myordering == 0) {
        if (header_buf[myordering + 1] == -1) {
          to = 0;
        } else {
          to = header_buf[myordering + 1];
        }
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }

      /* recv, reduce, send */
      else {
        if (header_buf[myordering + 1] == -1) {
          to = 0;
        } else {
          to = header_buf[myordering + 1];
        }
        from = header_buf[myordering - 1];
        smpi_mpi_recv(tmp_buf, count, datatype, header_buf[myordering - 1], tag,
                 comm, &status);
        smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }
    }                           /* non-root */
  }
  /* pipelined reduce */
  else {
    //    printf("node %d start\n",rank);

    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      sent_count = 0;

      int will_send[MAX_NODE];
      for (i = 0; i < MAX_NODE; i++)
        will_send[i] = 0;

      /* loop until all data are received (sent) */
      while (sent_count < (size - 1)) {
        int k;
        for (k = 0; k < 1; k++) {
          for (i = 1; i < size; i++) {
            //if (i == rank)
            //continue;
            if ((already_received[i] == 0) && (will_send[i] == 0)) {
                smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                         &temp_status_array[i]);
              if (flag_array[i] == 1) {
                will_send[i] = 1;
                smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD,
                         &status);
                //printf("recv from %d\n",i);
                i = 1;
              }
            }
          }
        }                       /* end of probing */

        header_index = 0;

        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          //if (i==rank)
          //continue;
          /* message arrived in this round (put in the header) */
          if ((will_send[i] == 1) && (already_received[i] == 0)) {
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          /* recv data - pipeline */
          from = header_buf[header_index - 1];
          for (i = 0; i < pipe_length; i++) {
            smpi_mpi_recv(tmp_buf + (i * increment), segment, datatype, from, tag,
                     comm, &status);
            smpi_op_apply(op, tmp_buf + (i * increment),
                           (char *)rbuf + (i * increment), &segment, &datatype);
          }
        }
      }                         /* while loop (sent_count < size-1 ) */
    }

    /* root */
    /* non-root */
    else {
      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);


      /* wait for header forward when required */
      request=smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is */
      int myordering = 0;

      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }

      /* (receive, reduce), and send data */
      if (header_buf[myordering + 1] == -1) {
        to = 0;
      } else {
        to = header_buf[myordering + 1];
      }

      /* send only */
      if (myordering == 0) {
        for (i = 0; i < pipe_length; i++) {
            send_request_array[i]= smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }

      /* receive, reduce, and send */
      else {
        from = header_buf[myordering - 1];
        for (i = 0; i < pipe_length; i++) {
          recv_request_array[i]=smpi_mpi_irecv(tmp_buf + (i * increment), segment, datatype, from, tag, comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          smpi_op_apply(op, tmp_buf + (i * increment), (char *)rbuf + (i * increment),
                         &segment, &datatype);
          send_request_array[i]=smpi_mpi_isend((char *)rbuf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }
    }                           /* non-root */




    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);

    //printf("node %d done\n",rank);
  }                             /* end pipeline */


  /* if root is not zero, send the result from rank 0 to root after finishing;
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == 0) {
      smpi_mpi_send(rbuf, count, datatype, root, tag, comm);
    } else if (rank == root) {
      smpi_mpi_recv(rbuf, count, datatype, 0, tag, comm, &status);
    }
  }


  /* when count is not divisible by block size, use default REDUCE for the remainder */
  if ((remainder != 0) && (count > segment)) {
    smpi_mpi_reduce((char *)buf + (pipe_length * increment),
	       (char *)rbuf + (pipe_length * increment), remainder, datatype, op, root,
               comm);
  }

  free(tmp_buf);

  return MPI_SUCCESS;
}
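
/*
 * Editor's note: a standalone sketch (not from the SimGrid sources) of the segment
 * arithmetic shared by the pipelined algorithms in this listing: the pipeline block
 * size in elements, the number of full blocks, the byte stride between blocks, and
 * the remainder that is handed to the default implementation. The concrete numbers
 * (count, extent, segment size) are arbitrary example values.
 */
#include <stdio.h>

int main(void)
{
  int count = 10000;                /* elements to reduce/broadcast (example) */
  int extent = 8;                   /* datatype extent in bytes, e.g. a double */
  int segment_size_in_byte = 8192;  /* tuning parameter (example value) */

  int segment = segment_size_in_byte / extent; /* 1024 elements per pipeline block */
  int pipe_length = count / segment;           /* 9 full blocks go through the pipeline */
  int increment = segment * extent;            /* 8192-byte offset between blocks */
  int remainder = count % segment;             /* 784 trailing elements, default algorithm */

  printf("segment=%d pipe_length=%d increment=%d remainder=%d\n",
         segment, pipe_length, increment, remainder);
  return 0;
}
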
Example #8
int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype,
                               int root, MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  int rank, size;
  int i;

  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  rank = smpi_comm_rank(MPI_COMM_WORLD);
  size = smpi_comm_size(MPI_COMM_WORLD);

  /* source node and destination nodes (same throughout the function) */
  int from = (rank - 1) / 2;
  int to_left = rank * 2 + 1;
  int to_right = rank * 2 + 2;
  if (to_left >= size)
    to_left = -1;
  if (to_right >= size)
    to_right = -1;

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_NTSB_segment_size_in_byte / extent;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size => 
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {

    /* case: root */
    if (rank == 0) {
      /* case root has only a left child */
      if (to_right == -1) {
        smpi_mpi_send(buf, count, datatype, to_left, tag, comm);
      }
      /* case root has both left and right children */
      else {
        smpi_mpi_send(buf, count, datatype, to_left, tag, comm);
        smpi_mpi_send(buf, count, datatype, to_right, tag, comm);
      }
    }

    /* case: leaf ==> receive only */
    else if (to_left == -1) {
      smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status);
    }

    /* case: intermediate node with only left child ==> relay message */
    else if (to_right == -1) {
      smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status);
      smpi_mpi_send(buf, count, datatype, to_left, tag, comm);
    }

    /* case: intermediate node with both left and right children ==> relay message */
    else {
      smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status);
      smpi_mpi_send(buf, count, datatype, to_left, tag, comm);
      smpi_mpi_send(buf, count, datatype, to_right, tag, comm);
    }
    return MPI_SUCCESS;
  }
  // pipelining
  else {

    send_request_array =
        (MPI_Request *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));



    /* case: root */
    if (rank == 0) {
      /* case root has only a left child */
      if (to_right == -1) {
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left,
                    tag + i, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }
      /* case root has both left and right children */
      else {
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left,
                    tag + i, comm);
          send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right,
                    tag + i, comm);
        }
        smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array);
      }
    }

    /* case: leaf ==> receive only */
    else if (to_left == -1) {
      for (i = 0; i < pipe_length; i++) {
        recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from,
                  tag + i, comm);
      }
      smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array);
    }

    /* case: intermediate node with only left child ==> relay message */
    else if (to_right == -1) {
      for (i = 0; i < pipe_length; i++) {
        recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from,
                  tag + i, comm);
      }
      for (i = 0; i < pipe_length; i++) {
        smpi_mpi_wait(&recv_request_array[i], &status);
        send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left,
                  tag + i, comm);
      }
      smpi_mpi_waitall(pipe_length, send_request_array, send_status_array);

    }
    /* case: intermediate node with both left and right children ==> relay message */
    else {
      for (i = 0; i < pipe_length; i++) {
        recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from,
                  tag + i, comm);
      }
      for (i = 0; i < pipe_length; i++) {
        smpi_mpi_wait(&recv_request_array[i], &status);
        send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left,
                  tag + i, comm);
        send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right,
                  tag + i, comm);
      }
      smpi_mpi_waitall((2 * pipe_length), send_request_array, send_status_array);
    }

    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);
  }                             /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_NTSB use default MPI_bcast.");	  	  
    smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
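
/*
 * Editor's note: a minimal standalone sketch (not from the SimGrid sources) of the
 * implicit binary tree used by smpi_coll_tuned_bcast_NTSB above: rank r receives
 * from (r - 1) / 2 and forwards to 2r + 1 and 2r + 2 whenever those ranks exist.
 */
#include <stdio.h>

int main(void)
{
  int size = 7;  /* example communicator size */
  for (int rank = 0; rank < size; rank++) {
    int from = (rank - 1) / 2;
    int to_left = rank * 2 + 1;
    int to_right = rank * 2 + 2;
    printf("rank %d: parent %d, left child %d, right child %d   (-1 = none)\n",
           rank,
           rank == 0 ? -1 : from,            /* the root has no parent */
           to_left >= size ? -1 : to_left,   /* children beyond 'size' are dropped */
           to_right >= size ? -1 : to_right);
  }
  return 0;
}
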
/**
 * Alltoall Bruck
 *
 * Open MPI calls this routine when the message size sent to each rank is < 2000 bytes and size < 12.
 * FIXME: uh, check smpi_pmpi again, but this routine is called for > 12, not
 * less...
 **/
int smpi_coll_tuned_alltoallv_bruck(void *sendbuf, int *sendcounts, int *senddisps,
                                   MPI_Datatype sendtype, void *recvbuf,
                                   int *recvcounts, int *recvdisps, MPI_Datatype recvtype,
                                   MPI_Comm comm)
{
  int system_tag = COLL_TAG_ALLTOALLV;
  int i, rank, size, err, count;
  MPI_Aint lb;
  MPI_Aint sendext = 0;
  MPI_Aint recvext = 0;
  MPI_Request *requests;

  // FIXME: check implementation
  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  XBT_DEBUG("<%d> algorithm alltoall_bruck() called.", rank);

  smpi_datatype_extent(sendtype, &lb, &sendext);
  smpi_datatype_extent(recvtype, &lb, &recvext);
  /* Local copy from self */
  err =
      smpi_datatype_copy((char *)sendbuf + senddisps[rank] * sendext,
                         sendcounts[rank], sendtype,
                         (char *)recvbuf + recvdisps[rank] * recvext,
                         recvcounts[rank], recvtype);
  if (err == MPI_SUCCESS && size > 1) {
    /* Initiate all send/recv to/from others. */

      int bblock = 4; // MPIR_PARAM_ALLTOALL_THROTTLE
      //if (bblock == 0) bblock = comm_size;


     // requests = xbt_new(MPI_Request, 2 * (bblock - 1));
      int ii, ss, dst;
      /* post only bblock isends/irecvs at a time as suggested by Tony Ladd */
      for (ii=0; ii<size; ii+=bblock) {
          requests = xbt_new(MPI_Request, 2 * (bblock ));

          ss = size-ii < bblock ? size-ii : bblock;
          count = 0;

          /* do the communication -- post ss sends and receives: */
          for ( i=0; i<ss; i++ ) {
            dst = (rank+i+ii) % size;
              if (dst == rank) {
                XBT_DEBUG("<%d> skip request creation [src = %d, recvcount = %d]",
                       rank, i, recvcounts[dst]);
                continue;
              }

              requests[count]=smpi_mpi_irecv((char *)recvbuf + recvdisps[dst] * recvext, recvcounts[dst],
                                  recvtype, dst, system_tag, comm );
              count++;
            }
            /* Now create all sends  */
          for ( i=0; i<ss; i++ ) {
              dst = (rank-i-ii+size) % size;
              if (dst == rank) {
                XBT_DEBUG("<%d> skip request creation [dst = %d, sendcount = %d]",
                       rank, i, sendcounts[dst]);
                continue;
              }
              requests[count]=smpi_mpi_isend((char *)sendbuf + senddisps[dst] * sendext, sendcounts[dst],
                                  sendtype, dst, system_tag, comm);
              count++;
            }
            /* Wait for them all. */
            //smpi_mpi_startall(count, requests);
            XBT_DEBUG("<%d> wait for %d requests", rank, count);
            smpi_mpi_waitall(count, requests, MPI_STATUSES_IGNORE);
            xbt_free(requests);

          }

  }
  return MPI_SUCCESS;
}
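
/*
 * Editor's note: a minimal standalone sketch (not from the SimGrid sources) of the
 * throttling loop used above: peers are processed in windows of at most bblock
 * ranks, and each window posts its irecvs and isends and then waits for all of
 * them before the next window starts. The values of size and bblock are examples.
 */
#include <stdio.h>

int main(void)
{
  int size = 10;   /* communicator size (example) */
  int bblock = 4;  /* at most 4 isend/irecv pairs in flight, as in the code above */

  for (int ii = 0; ii < size; ii += bblock) {
    int ss = (size - ii < bblock) ? size - ii : bblock; /* peers in this window */
    printf("window at offset %d: post %d irecvs and %d isends, then waitall\n",
           ii, ss, ss);
  }
  return 0;
}
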
Example #10
int smpi_coll_tuned_reduce_mvapich2_knomial (
        void *sendbuf,
        void *recvbuf,
        int count,
        MPI_Datatype datatype,
        MPI_Op op,
        int root,
        MPI_Comm comm)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, is_commutative;
    int src, k;
    MPI_Request send_request;
    int index=0;
    MPI_Aint true_lb, true_extent, extent;
    MPI_Status status; 
    int recv_iter=0, dst=-1, expected_send_count, expected_recv_count;
    int *src_array=NULL;
    void **tmp_buf=NULL;
    MPI_Request *requests=NULL;


    if (count == 0) return MPI_SUCCESS;

    rank = smpi_comm_rank(comm);

    /* Create a temporary buffer */

    smpi_datatype_extent(datatype, &true_lb, &true_extent);
    extent = smpi_datatype_get_extent(datatype);

    is_commutative = smpi_op_is_commute(op);

    if (rank != root) {
        recvbuf=(void *)smpi_get_tmp_recvbuffer(count*(MAX(extent,true_extent)));
        recvbuf = (void *)((char*)recvbuf - true_lb);
    }

    if ((rank != root) || (sendbuf != MPI_IN_PLACE)) {
        mpi_errno = smpi_datatype_copy(sendbuf, count, datatype, recvbuf,
                count, datatype);
    }


    if(mv2_reduce_intra_knomial_factor<0)
      {
        mv2_reduce_intra_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }
    if(mv2_reduce_inter_knomial_factor<0)
      {
        mv2_reduce_inter_knomial_factor = SMPI_DEFAULT_KNOMIAL_FACTOR;
      }


    MPIR_Reduce_knomial_trace(root, mv2_reduce_intra_knomial_factor, comm, 
           &dst, &expected_send_count, &expected_recv_count, &src_array);

    if(expected_recv_count > 0 ) {
        tmp_buf  = xbt_malloc(sizeof(void *)*expected_recv_count);
        requests = xbt_malloc(sizeof(MPI_Request)*expected_recv_count);
        for(k=0; k < expected_recv_count; k++ ) {
            tmp_buf[k] = smpi_get_tmp_sendbuffer(count*(MAX(extent,true_extent)));
            tmp_buf[k] = (void *)((char*)tmp_buf[k] - true_lb);
        }

        while(recv_iter  < expected_recv_count) {
            src = src_array[expected_recv_count - (recv_iter+1)];

            requests[recv_iter]=smpi_mpi_irecv (tmp_buf[recv_iter], count, datatype ,src,
                    COLL_TAG_REDUCE, comm);
            recv_iter++;

        }

        recv_iter=0;
        while(recv_iter < expected_recv_count) {
            index=smpi_mpi_waitany(expected_recv_count, requests,
                    &status);
            recv_iter++;

            if (is_commutative) {
              smpi_op_apply(op, tmp_buf[index], recvbuf, &count, &datatype);
            }
        }

        for(k=0; k < expected_recv_count; k++ ) {
            smpi_free_tmp_buffer(tmp_buf[k]);
        }
        xbt_free(tmp_buf);
        xbt_free(requests);
    }

    if(src_array != NULL) { 
        xbt_free(src_array);
    } 

    if(rank != root) {
        send_request=smpi_mpi_isend(recvbuf,count, datatype, dst,
                COLL_TAG_REDUCE,comm);

        smpi_mpi_waitall(1, &send_request, &status);

        smpi_free_tmp_buffer((void *)((char*)recvbuf + true_lb));
    }

    /* --END ERROR HANDLING-- */

    return mpi_errno;
}
int smpi_coll_tuned_alltoall_mvapich2_scatter_dest(
                            void *sendbuf,
                            int sendcount,
                            MPI_Datatype sendtype,
                            void *recvbuf,
                            int recvcount,
                            MPI_Datatype recvtype,
                            MPI_Comm comm)
{
    int          comm_size, i, j;
    MPI_Aint     sendtype_extent = 0, recvtype_extent = 0;
    int mpi_errno=MPI_SUCCESS;
    int dst, rank;
    MPI_Request *reqarray;
    MPI_Status *starray;
    
    if (recvcount == 0) return MPI_SUCCESS;
    
    comm_size =  smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    
    /* Get extent of send and recv types */
    recvtype_extent = smpi_datatype_get_extent(recvtype);
    sendtype_extent = smpi_datatype_get_extent(sendtype);
    
    /* Medium-size message. Use isend/irecv with scattered
     destinations. Use Tony Ladd's modification to post only
     a small number of isends/irecvs at a time. */
    /* FIXME: This converts the Alltoall to a set of blocking phases.
     Two alternatives should be considered:
     1) the choice of communication pattern could try to avoid
     contending routes in each phase
     2) rather than wait for all communication to finish (waitall),
     we could maintain constant queue size by using waitsome
     and posting new isend/irecv as others complete.  This avoids
     synchronization delays at the end of each block (when
     there are only a few isend/irecvs left)
     */
    int ii, ss, bblock;
        
    //Stampede is configured with
    bblock = MV2_ALLTOALL_THROTTLE_FACTOR;//mv2_coll_param.alltoall_throttle_factor;
    
    if (bblock >= comm_size) bblock = comm_size;
    /* If throttle_factor is n, each process posts n pairs of isend/irecv
     in each iteration. */
    
    /* FIXME: This should use the memory macros (there are storage
     leaks here if there is an error, for example) */
    reqarray= (MPI_Request*)xbt_malloc(2*bblock*sizeof(MPI_Request));
    
    starray=(MPI_Status *)xbt_malloc(2*bblock*sizeof(MPI_Status));
 
    for (ii=0; ii<comm_size; ii+=bblock) {
        ss = comm_size-ii < bblock ? comm_size-ii : bblock;
        /* do the communication -- post ss sends and receives: */
        for ( i=0; i<ss; i++ ) {
            dst = (rank+i+ii) % comm_size;
            reqarray[i]=smpi_mpi_irecv((char *)recvbuf +
                                      dst*recvcount*recvtype_extent,
                                      recvcount, recvtype, dst,
                                      COLL_TAG_ALLTOALL, comm);

        }
        for ( i=0; i<ss; i++ ) {
            dst = (rank-i-ii+comm_size) % comm_size;
            reqarray[i+ss]=smpi_mpi_isend((char *)sendbuf +
                                          dst*sendcount*sendtype_extent,
                                          sendcount, sendtype, dst,
                                          COLL_TAG_ALLTOALL, comm);

        }
        
        /* ... then wait for them to finish: */
        smpi_mpi_waitall(2*ss,reqarray,starray);
        
       
        /* --BEGIN ERROR HANDLING-- */
        if (mpi_errno == MPI_ERR_IN_STATUS) {
                for (j=0; j<2*ss; j++) {
                     if (starray[j].MPI_ERROR != MPI_SUCCESS) {
                         mpi_errno = starray[j].MPI_ERROR;
                     }
                }
        }
    }
    /* --END ERROR HANDLING-- */
    xbt_free(starray);
    xbt_free(reqarray);
    return (mpi_errno);
    
}
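
/*
 * Editor's note: the FIXME above suggests keeping a constant number of requests in
 * flight with waitsome instead of a per-window waitall. This is a hedged sketch of
 * that idea for the receive side only, written against the standard MPI API rather
 * than SMPI internals; the helper name, the simplified buffer handling, and the
 * 'window' parameter are illustrative assumptions, not SimGrid code.
 */
#include <mpi.h>
#include <stdlib.h>

static void recv_blocks_with_waitsome(char *recvbuf, int blocksize, int nblocks,
                                      int src, int tag, MPI_Comm comm, int window)
{
  MPI_Request *reqs = (MPI_Request *) malloc((size_t)window * sizeof(MPI_Request));
  int *indices = (int *) malloc((size_t)window * sizeof(int));
  int posted = 0, completed = 0, inflight = 0;

  while (completed < nblocks) {
    /* top the queue up to 'window' outstanding receives */
    while (inflight < window && posted < nblocks) {
      MPI_Irecv(recvbuf + (size_t)posted * blocksize, blocksize, MPI_CHAR,
                src, tag, comm, &reqs[inflight]);
      posted++;
      inflight++;
    }
    /* wait for at least one receive to finish, then compact the request array */
    int outcount = 0;
    MPI_Waitsome(inflight, reqs, &outcount, indices, MPI_STATUSES_IGNORE);
    completed += outcount;
    int kept = 0;
    for (int i = 0; i < inflight; i++)
      if (reqs[i] != MPI_REQUEST_NULL)   /* completed requests are set to MPI_REQUEST_NULL */
        reqs[kept++] = reqs[i];
    inflight = kept;
  }
  free(indices);
  free(reqs);
}
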
int smpi_coll_tuned_bcast_ompi_pipeline( void* buffer,
                                      int original_count, 
                                      MPI_Datatype datatype, 
                                      int root,
                                      MPI_Comm comm)
{
    int count_by_segment = original_count;
    size_t type_size;
    int segsize = 1024 << 7;
    //mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module;
    //mca_coll_tuned_comm_t *data = tuned_module->tuned_data;
    
//    return ompi_coll_tuned_bcast_intra_generic( buffer, count, datatype, root, comm, module,
//                                                count_by_segment, data->cached_pipeline );
    ompi_coll_tree_t * tree = ompi_coll_tuned_topo_build_chain( 1, comm, root );
    int i;
    int rank, size;
    int segindex;
    int num_segments; /* Number of segments */
    int sendcount;    /* number of elements sent in this segment */ 
    size_t realsegsize;
    char *tmpbuf;
    ptrdiff_t extent;
    MPI_Request recv_reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    MPI_Request *send_reqs = NULL;
    int req_index;
    
    /**
     * Determine number of elements sent per operation.
     */
    type_size = smpi_datatype_size(datatype);

    size = smpi_comm_size(comm);
    rank = smpi_comm_rank(comm);
    xbt_assert( size > 1 );


    const double a_p16  = 3.2118e-6; /* [1 / byte] */
    const double b_p16  = 8.7936;   
    const double a_p64  = 2.3679e-6; /* [1 / byte] */
    const double b_p64  = 1.1787;     
    const double a_p128 = 1.6134e-6; /* [1 / byte] */
    const double b_p128 = 2.1102;
    size_t message_size;

    /* else we need data size for decision function */
    message_size = type_size * (unsigned long)original_count;   /* needed for decision */

    if (size < (a_p128 * message_size + b_p128)) {
            //Pipeline with 128KB segments 
            segsize = 1024  << 7;
    }else if (size < (a_p64 * message_size + b_p64)) {
            // Pipeline with 64KB segments 
            segsize = 1024 << 6;
    }else if (size < (a_p16 * message_size + b_p16)) {
            //Pipeline with 16KB segments 
            segsize = 1024 << 4;
    }

    COLL_TUNED_COMPUTED_SEGCOUNT( segsize, type_size, count_by_segment );

    XBT_DEBUG("coll:tuned:bcast_intra_pipeline rank %d ss %5d type_size %lu count_by_segment %d",
                 smpi_comm_rank(comm), segsize, (unsigned long)type_size, count_by_segment);



    extent = smpi_datatype_get_extent (datatype);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    realsegsize = count_by_segment * extent;
    
    /* Set the buffer pointers */
    tmpbuf = (char *) buffer;

    if( tree->tree_nextsize != 0 ) {
        send_reqs = xbt_new(MPI_Request, tree->tree_nextsize  );
    }

    /* Root code */
    if( rank == root ) {
        /* 
           For each segment:
           - send segment to all children.
             The last segment may have less elements than other segments.
        */
        sendcount = count_by_segment;
        for( segindex = 0; segindex < num_segments; segindex++ ) {
            if( segindex == (num_segments - 1) ) {
                sendcount = original_count - segindex * count_by_segment;
            }
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm);
           } 

            /* complete the sends before starting the next sends */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );

            /* update tmp buffer */
            tmpbuf += realsegsize;

        }
    } 
    
    /* Intermediate nodes code */
    else if( tree->tree_nextsize > 0 ) { 
        /* 
           Create the pipeline. 
           1) Post the first receive
           2) For segments 1 .. num_segments
              - post new receive
              - wait on the previous receive to complete
              - send this data to children
           3) Wait on the last segment
           4) Compute number of elements in last segment.
           5) Send the last segment to children
         */
        req_index = 0;
        recv_reqs[req_index]=smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                           tree->tree_prev, COLL_TAG_BCAST,
                           comm);
        
        for( segindex = 1; segindex < num_segments; segindex++ ) {
            
            req_index = req_index ^ 0x1;
            
            /* post new irecv */
            recv_reqs[req_index]= smpi_mpi_irecv( tmpbuf + realsegsize, count_by_segment,
                                datatype, tree->tree_prev, 
                                COLL_TAG_BCAST,
                                comm);
            
            /* wait for and forward the previous segment to children */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1],
                                     MPI_STATUS_IGNORE );
            
            for( i = 0; i < tree->tree_nextsize; i++ ) { 
                send_reqs[i]=smpi_mpi_isend(tmpbuf, count_by_segment, datatype,
                                         tree->tree_next[i], 
                                         COLL_TAG_BCAST, comm );
            } 
            
            /* complete the sends before starting the next iteration */
            smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                         MPI_STATUSES_IGNORE );
            
            /* Update the receive buffer */
            tmpbuf += realsegsize;
        }

        /* Process the last segment */
        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
        sendcount = original_count - (num_segments - 1) * count_by_segment;
        for( i = 0; i < tree->tree_nextsize; i++ ) {
            send_reqs[i] = smpi_mpi_isend(tmpbuf, sendcount, datatype,
                                     tree->tree_next[i], 
                                     COLL_TAG_BCAST, comm);
        }
        
        smpi_mpi_waitall( tree->tree_nextsize, send_reqs, 
                                     MPI_STATUSES_IGNORE );
    }
  
    /* Leaf nodes */
    else {
        /* 
           Receive all segments from parent in a loop:
           1) post irecv for the first segment
           2) for segments 1 .. num_segments
              - post irecv for the next segment
              - wait on the previous segment to arrive
           3) wait for the last segment
        */
        req_index = 0;
        recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype,
                                 tree->tree_prev, COLL_TAG_BCAST,
                                 comm);

        for( segindex = 1; segindex < num_segments; segindex++ ) {
            req_index = req_index ^ 0x1;
            tmpbuf += realsegsize;
            /* post receive for the next segment */
            recv_reqs[req_index] = smpi_mpi_irecv(tmpbuf, count_by_segment, datatype, 
                                     tree->tree_prev, COLL_TAG_BCAST,
                                     comm);
            /* wait on the previous segment */
            smpi_mpi_wait( &recv_reqs[req_index ^ 0x1], 
                                     MPI_STATUS_IGNORE );
        }

        smpi_mpi_wait( &recv_reqs[req_index], MPI_STATUS_IGNORE );
    }

    if( NULL != send_reqs ) free(send_reqs);

    return (MPI_SUCCESS);
}
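
/*
 * Editor's note: a minimal standalone sketch (not from the SimGrid sources) of the
 * two-slot double buffering used by the pipeline above: recv_reqs[] has two entries
 * and req_index ^ 0x1 alternates between them, so the receive for segment s+1 is
 * posted before waiting on segment s.
 */
#include <stdio.h>

int main(void)
{
  int num_segments = 5;  /* example value */
  int req_index = 0;

  printf("post recv for segment 0 in slot %d\n", req_index);
  for (int segindex = 1; segindex < num_segments; segindex++) {
    req_index = req_index ^ 0x1;                    /* switch to the other slot */
    printf("post recv for segment %d in slot %d, then wait on slot %d (segment %d)\n",
           segindex, req_index, req_index ^ 0x1, segindex - 1);
  }
  printf("wait on slot %d (segment %d)\n", req_index, num_segments - 1);
  return 0;
}
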
int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }
  int host_num_core=1;
  if (smpi_comm_is_uniform(comm)){
    host_num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }else{
    //implementation buggy in this case
    return smpi_coll_tuned_bcast_mpich( buf , count, datatype,
              root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  int pipe_length = count / segment;
  int remainder = count % segment;

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
  }
  // when a message is smaller than a block size => no pipeline 
  if (count <= segment) {
    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
        if (to_inter_left < size)
          smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm);
        smpi_mpi_wait(&request, &status);
        if ((to_intra_left - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm);
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        request = smpi_mpi_irecv(buf, count, datatype, from_inter, tag, comm);
        smpi_mpi_wait(&request, &status);
        smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm);
        if (to_inter_right < size)
          smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm);
        if ((to_intra_left - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        request = smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm);
        smpi_mpi_wait(&request, &status);
      }
      // case intermediate
      else {
        request = smpi_mpi_irecv(buf, count, datatype, from_intra, tag, comm);
        smpi_mpi_wait(&request, &status);
        smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm);
        if ((to_intra_right - base) < num_core)
          smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm);
      }
    }

    return MPI_SUCCESS;
  }

  // pipeline bcast
  else {
    request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    // case ROOT-of-each-SMP
    if (rank % host_num_core == 0) {
      // case ROOT
      if (rank == 0) {
        for (i = 0; i < pipe_length; i++) {
          //printf("node %d left %d right %d\n",rank,to_inter_left,to_inter_right);
          if (to_inter_left < size)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case LEAVES ROOT-of-each-SMP
      else if (to_inter_left >= size) {
        //printf("node %d from %d\n",rank,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&request_array[i], &status);
          if ((to_intra_left - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
      // case INTERMEDIATE ROOT-of-each-SMP
      else {
        //printf("node %d left %d right %d from %d\n",rank,to_inter_left,to_inter_right,from_inter);
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                    from_inter, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&request_array[i], &status);
          smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                   to_inter_left, (tag + i), comm);
          if (to_inter_right < size)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_inter_right, (tag + i), comm);
          if ((to_intra_left - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }
    // case non-ROOT-of-each-SMP
    else {
      // case leaves
      if ((to_intra_left - base) >= num_core) {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        smpi_mpi_waitall((pipe_length), request_array, status_array);
      }
      // case intermediate
      else {
        for (i = 0; i < pipe_length; i++) {
          request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                    from_intra, (tag + i), comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&request_array[i], &status);
          smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                   to_intra_left, (tag + i), comm);
          if ((to_intra_right - base) < num_core)
            smpi_mpi_send((char *) buf + (i * increment), segment, datatype,
                     to_intra_right, (tag + i), comm);
        }
      }
    }

    free(request_array);
    free(status_array);
  }

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast.");	  
    smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
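
/*
 * Editor's note: a standalone sketch (not from the SimGrid sources) that prints the
 * neighbour ranks computed by smpi_coll_tuned_bcast_SMP_binary above: a binary tree
 * over the first core of each SMP node (inter) plus a binary tree over the cores of
 * one node (intra). The caller only uses a child when it falls inside the node or
 * the communicator; host_num_core and size below are example values.
 */
#include <stdio.h>

int main(void)
{
  int host_num_core = 4;  /* cores per SMP node (example) */
  int size = 16;          /* total number of ranks (example) */

  for (int rank = 0; rank < size; rank++) {
    int base = (rank / host_num_core) * host_num_core;
    int to_intra_left  = base + (rank % host_num_core) * 2 + 1;
    int to_intra_right = base + (rank % host_num_core) * 2 + 2;
    int to_inter_left  = ((rank / host_num_core) * 2 + 1) * host_num_core;
    int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
    printf("rank %2d: intra children %d/%d, inter children %d/%d\n",
           rank, to_intra_left, to_intra_right, to_inter_left, to_inter_right);
  }
  return 0;
}
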
int 
smpi_coll_tuned_allreduce_ompi_ring_segmented(void *sbuf, void *rbuf, int count,
                                               MPI_Datatype dtype,
                                               MPI_Op op,
                                               MPI_Comm comm) 
{
   int ret = MPI_SUCCESS;
   int line;
   int k, recv_from, send_to;
   int early_blockcount, late_blockcount, split_rank; 
   int segcount, max_segcount;
   int num_phases, phase;
   int block_count;
   unsigned int inbi;
   size_t typelng;
   char *tmpsend = NULL, *tmprecv = NULL;
   char *inbuf[2] = {NULL, NULL};
   ptrdiff_t true_extent, extent;
   ptrdiff_t block_offset, max_real_segsize;
   MPI_Request reqs[2] = {NULL, NULL};
   const size_t segsize = 1 << 20; /* 1 MB */
   unsigned int size = smpi_comm_size(comm);
   unsigned int rank = smpi_comm_rank(comm);

   XBT_DEBUG("coll:tuned:allreduce_intra_ring_segmented rank %d, count %d", rank, count);

   /* Special case for size == 1 */
   if (1 == size) {
      if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
         if (ret < 0) { line = __LINE__; goto error_hndl; }
      }
      return MPI_SUCCESS;
   }
   
   /* Determine segment count based on the suggested segment size */
   extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   true_extent = smpi_datatype_get_extent(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   typelng = smpi_datatype_size(dtype);
   if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
   segcount = count;
   COLL_TUNED_COMPUTED_SEGCOUNT(segsize, typelng, segcount)

   /* Special case for count less than size * segcount - use regular ring */
   if (count < size * segcount) {
      XBT_DEBUG( "coll:tuned:allreduce_ring_segmented rank %d/%d, count %d, switching to regular ring", rank, size, count);
      return (smpi_coll_tuned_allreduce_lr(sbuf, rbuf, count, dtype, op, 
                                                   comm));
   }

   /* Determine the number of phases of the algorithm */
   num_phases = count / (size * segcount);
   if ((count % (size * segcount) >= size) && 
       (count % (size * segcount) > ((size * segcount) / 2))) {
      num_phases++;
   }

   /* Determine the number of elements per block and corresponding 
      block sizes.
      The blocks are divided into "early" and "late" ones:
      blocks 0 .. (split_rank - 1) are "early" and 
      blocks (split_rank) .. (size - 1) are "late".
      Early blocks are at most 1 element larger than the late ones.
       Note that these blocks will be split into num_phases segments,
       the largest of which will have max_segcount elements.
    */
   COLL_TUNED_COMPUTE_BLOCKCOUNT( count, size, split_rank, 
                                  early_blockcount, late_blockcount )
   COLL_TUNED_COMPUTE_BLOCKCOUNT( early_blockcount, num_phases, inbi,
                                  max_segcount, k)
   max_real_segsize = true_extent + (max_segcount - 1) * extent;
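   /* Editor's note (worked example, following the description above): with
      count = 10 and size = 4, the buffer is split into blocks of 3, 3, 2 and 2
      elements, i.e. split_rank = 2, early_blockcount = 3, late_blockcount = 2;
      each block is then cut again into num_phases segments. */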

   /* Allocate and initialize temporary buffers */
   inbuf[0] = (char*)smpi_get_tmp_sendbuffer(max_real_segsize);
   if (NULL == inbuf[0]) { ret = -1; line = __LINE__; goto error_hndl; }
   if (size > 2) {
      inbuf[1] = (char*)smpi_get_tmp_recvbuffer(max_real_segsize);
      if (NULL == inbuf[1]) { ret = -1; line = __LINE__; goto error_hndl; }
   }

   /* Handle MPI_IN_PLACE */
   if (MPI_IN_PLACE != sbuf) {
      ret= smpi_datatype_copy(sbuf, count, dtype,rbuf, count, dtype);
      if (ret < 0) { line = __LINE__; goto error_hndl; }
   }

   /* Computation loop: for each phase, repeat ring allreduce computation loop */
   for (phase = 0; phase < num_phases; phase ++) {
      ptrdiff_t phase_offset;
      int early_phase_segcount, late_phase_segcount, split_phase, phase_count;

      /* 
         For each of the remote nodes:
         - post irecv for block (r-1)
         - send block (r)
           To do this, first compute block offset and count, and use block offset
           to compute phase offset.
         - in loop for every step k = 2 .. n
           - post irecv for block (r + n - k) % n
           - wait on block (r + n - k + 1) % n to arrive
           - compute on block (r + n - k + 1) % n
           - send block (r + n - k + 1) % n
         - wait on block (r + 1)
         - compute on block (r + 1)
         - send block (r + 1) to rank (r + 1)
         Note that we must be careful when computing the beginning of the buffers,
         and that for send operations and computation we must compute the exact block size.
      */
      send_to = (rank + 1) % size;
      recv_from = (rank + size - 1) % size;
      
      inbi = 0;
      /* Initialize first receive from the neighbor on the left */
      reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
      /* Send first block (my block) to the neighbor on the right:
         - compute my block and phase offset
         - send data */
      block_offset = ((rank < split_rank)? 
                      (rank * early_blockcount) : 
                      (rank * late_blockcount + split_rank));
      block_count = ((rank < split_rank)? early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmpsend = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_mpi_send(tmpsend, phase_count, dtype, send_to,
                              666, comm);
      
      for (k = 2; k < size; k++) {
         const int prevblock = (rank + size - k + 1) % size;
         
         inbi = inbi ^ 0x1;
         
         /* Post irecv for the current block */
         reqs[inbi] = smpi_mpi_irecv(inbuf[inbi], max_segcount, dtype, recv_from,
                               666, comm);
         if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; }
         
         /* Wait on previous block to arrive */
         smpi_mpi_wait(&reqs[inbi ^ 0x1], MPI_STATUS_IGNORE);
         
         /* Apply operation on previous block: result goes to rbuf
            rbuf[prevblock] = inbuf[inbi ^ 0x1] (op) rbuf[prevblock]
         */
         block_offset = ((prevblock < split_rank)?
                         (prevblock * early_blockcount) :
                         (prevblock * late_blockcount + split_rank));
         block_count = ((prevblock < split_rank)? 
                        early_blockcount : late_blockcount);
         COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                       early_phase_segcount, late_phase_segcount)
         phase_count = ((phase < split_phase)?
                        (early_phase_segcount) : (late_phase_segcount));
         phase_offset = ((phase < split_phase)?
                         (phase * early_phase_segcount) : 
                         (phase * late_phase_segcount + split_phase));
         tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
         smpi_op_apply(op, inbuf[inbi ^ 0x1], tmprecv, &phase_count, &dtype);
         /* send previous block to send_to */
         smpi_mpi_send(tmprecv, phase_count, dtype, send_to,
                              666, comm);
      }
      
      /* Wait on the last block to arrive */
      smpi_mpi_wait(&reqs[inbi], MPI_STATUS_IGNORE);

      
      /* Apply operation on the last block (from neighbor (rank + 1)):
         rbuf[rank+1] = inbuf[inbi] (op) rbuf[rank + 1] */
      recv_from = (rank + 1) % size;
      block_offset = ((recv_from < split_rank)?
                      (recv_from * early_blockcount) :
                      (recv_from * late_blockcount + split_rank));
      block_count = ((recv_from < split_rank)? 
                     early_blockcount : late_blockcount);
      COLL_TUNED_COMPUTE_BLOCKCOUNT(block_count, num_phases, split_phase,
                                    early_phase_segcount, late_phase_segcount)
      phase_count = ((phase < split_phase)?
                     (early_phase_segcount) : (late_phase_segcount));
      phase_offset = ((phase < split_phase)?
                      (phase * early_phase_segcount) : 
                      (phase * late_phase_segcount + split_phase));
      tmprecv = ((char*)rbuf) + (block_offset + phase_offset) * extent;
      smpi_op_apply(op, inbuf[inbi], tmprecv, &phase_count, &dtype);
   }

   /* Distribution loop - variation of ring allgather */
   send_to = (rank + 1) % size;
   recv_from = (rank + size - 1) % size;
   for (k = 0; k < size - 1; k++) {
      const int recv_data_from = (rank + size - k) % size;
      const int send_data_from = (rank + 1 + size - k) % size;
      const int send_block_offset = 
         ((send_data_from < split_rank)?
          (send_data_from * early_blockcount) :
          (send_data_from * late_blockcount + split_rank));
      const int recv_block_offset = 
         ((recv_data_from < split_rank)?
          (recv_data_from * early_blockcount) :
          (recv_data_from * late_blockcount + split_rank));
      block_count = ((send_data_from < split_rank)? 
                     early_blockcount : late_blockcount);

      tmprecv = (char*)rbuf + recv_block_offset * extent;
      tmpsend = (char*)rbuf + send_block_offset * extent;

      smpi_mpi_sendrecv(tmpsend, block_count, dtype, send_to,
                                     666,
                                     tmprecv, early_blockcount, dtype, recv_from,
                                     666,
                                     comm, MPI_STATUS_IGNORE);

   }

   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);

   return MPI_SUCCESS;

 error_hndl:
   XBT_DEBUG("%s:%4d\tRank %d Error occurred %d\n",
                __FILE__, line, rank, ret);
   if (NULL != inbuf[0]) smpi_free_tmp_buffer(inbuf[0]);
   if (NULL != inbuf[1]) smpi_free_tmp_buffer(inbuf[1]);
   return ret;
}
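/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   the early/late block split described in the phase loop above is assumed to
   follow the usual COLL_TUNED_COMPUTE_BLOCKCOUNT arithmetic, i.e. the first
   (count % num_blocks) blocks carry one extra element. */
static void compute_blockcount_sketch(int count, int num_blocks,
                                      int *split_index,
                                      int *early_block_count,
                                      int *late_block_count)
{
  *late_block_count  = count / num_blocks;    /* size of a "late" block */
  *early_block_count = *late_block_count;
  *split_index       = count % num_blocks;    /* number of "early" blocks */
  if (*split_index != 0)
    *early_block_count += 1;                  /* early blocks get one extra element */
}
/* For example, count = 10 and num_blocks = 4 give split_index = 2,
   early_block_count = 3 and late_block_count = 2, i.e. blocks of 3, 3, 2, 2. */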
/* Non-topology-specific pipelined linear-bcast function */
int smpi_coll_tuned_bcast_arrival_pattern_aware(void *buf, int count,
                                                MPI_Datatype datatype, int root,
                                                MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Status temp_status_array[MAX_NODE];

  int rank, size;
  int i, j;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_sent[MAX_NODE];
  int to_clean[MAX_NODE];
  int header_buf[HEADER_SIZE];
  char temp_buf[MAX_NODE];

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* destination */
  int to;



  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);


  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_NTSL_segment_size_in_byte / extent;
  segment =  segment == 0 ? 1 :segment; 
  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by the segment size =>
     the small remainder will be handled by the native implementation */
  int remainder = count % segment;

  /* if root is not zero, send to rank zero first;
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* value == 0 means root has not sent data (or header) to the node yet */
  for (i = 0; i < MAX_NODE; i++) {
    already_sent[i] = 0;
    to_clean[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                     MPI_STATUSES_IGNORE);
        }

        header_index = 0;
        /* recv 1-byte message */
        for (i = 1; i < size; i++) {

          /* message arrived */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_send(buf, count, datatype, to, tag, comm);
        }

        /* otherwise, send to one node that has not received the data yet */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm);
              smpi_mpi_send(buf, count, datatype, i, tag, comm);
              already_sent[i] = 1;
              sent_count++;
              break;
            }
          }
        }


      }                         /* while loop */
    }

    /* non-root */
    else {

      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
               &status);
      smpi_mpi_recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header followed by data */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
        smpi_mpi_send(buf, count, datatype, header_buf[myordering + 1], tag, comm);
      }
    }
  }
  /* pipeline bcast */
  else {
    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      //double start2 = MPI_Wtime();
      sent_count = 0;
      //int iteration = 0;
      while (sent_count < (size - 1)) {
        //iteration++;
        //start = MPI_Wtime();
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                     &temp_status_array[i]);
        }
        //total = MPI_Wtime() - start;
        //total *= 1000;
        //printf("Iprobe time = %.2f\n",total);
        header_index = 0;

        MPI_Wtime();
        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          /* message arrived */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
                     &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }
        //total = MPI_Wtime() - start;
        //total *= 1000;
        //printf("Recv 1-byte time = %.2f\n",total);

        /*
           if (header_index != 0) {
           printf("header index = %d node = ",header_index);
           for (i=0;i<header_index;i++) {
           printf("%d ",header_buf[i]);
           }
           printf("\n");
           }
         */

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          //start = MPI_Wtime();

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          //total = MPI_Wtime() - start;
          //total *= 1000;
          //printf("\tSend header to %d time = %.2f\n",to,total);

          //start = MPI_Wtime();

          /* send data - non-pipeline case */

          if (0 == 1) {
            //if (header_index == 1) {
            smpi_mpi_send(buf, count, datatype, to, tag, comm);
          }


          /* send data - pipeline */
          else {
            for (i = 0; i < pipe_length; i++) {
              smpi_mpi_send((char *)buf + (i * increment), segment, datatype, to, tag, comm);
            }
            //smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
          }
          //total = MPI_Wtime() - start;
          //total *= 1000;
          //printf("\tSend data to %d time = %.2f\n",to,total);

        }



        /* otherwise, send to one node that has not received the data yet */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              to = i;

              //start = MPI_Wtime();
              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

              /* still need to chop data so that we can use the same non-root code */
              for (j = 0; j < pipe_length; j++) {
                smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag,
                         comm);
              }

              //smpi_mpi_send(buf,count,datatype,to,tag,comm);
              //smpi_mpi_wait(&request,MPI_STATUS_IGNORE);

              //total = MPI_Wtime() - start;
              //total *= 1000;
              //printf("SEND TO SINGLE node %d time = %.2f\n",i,total);


              already_sent[i] = 1;
              to_clean[i]=1;
              sent_count++;
              break;
            }
          }
        }

      }                         /* while loop */

      for (i = 0; i < size; i++)
        if (to_clean[i] != 0)
          smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
      //total = MPI_Wtime() - start2;
      //total *= 1000;
      //printf("Node zero iter = %d time = %.2f\n",iteration,total);
    }

    /* end of rank 0 */
    /* non-root */
    else {
      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for the header; forward it when required */
      request = smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }

      /* receive data */

      if (0 == -1) {
        //if (header_buf[1] == -1) {
        request = smpi_mpi_irecv(buf, count, datatype, 0, tag, comm);
        smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
        //printf("\t\tnode %d ordering = %d receive data from root\n",rank,myordering);
      } else {
        for (i = 0; i < pipe_length; i++) {
          recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, MPI_ANY_SOURCE,
                                                 tag, comm);
        }
      }

      /* send data */
      if (header_buf[myordering + 1] != -1) {
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype,
                    header_buf[myordering + 1], tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      } else {
        smpi_mpi_waitall(pipe_length, recv_request_array, recv_status_array);
      }
    
    }

    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);
  }                             /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware use default MPI_bcast.");	  
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm);
  }

  return MPI_SUCCESS;
}
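/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   the non-root branches above look up their own rank in the header list sent
   by the root and forward to the rank that follows; -1 terminates the chain. */
static int next_hop_sketch(const int *header_buf, int my_rank)
{
  int my_ordering = 0;
  while (header_buf[my_ordering] != my_rank)
    my_ordering++;                      /* find own position in the chain */
  return header_buf[my_ordering + 1];   /* rank to forward to, or -1 at the tail */
}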
Exemple #16
0
/*
 *  reduce_scatter_ompi_basic_recursivehalving
 *
 *  Function:   - reduce scatter implementation using recursive-halving 
 *                algorithm
 *  Accepts:    - same as MPI_Reduce_scatter()
 *  Returns:    - MPI_SUCCESS or error code
 *  Limitation: - Works only for commutative operations.
 */
int
smpi_coll_tuned_reduce_scatter_ompi_basic_recursivehalving(void *sbuf, 
                                                            void *rbuf, 
                                                            int *rcounts,
                                                            MPI_Datatype dtype,
                                                            MPI_Op op,
                                                            MPI_Comm comm
                                                            )
{
    int i, rank, size, count, err = MPI_SUCCESS;
    int tmp_size=1, remain = 0, tmp_rank, *disps = NULL;
    ptrdiff_t true_lb, true_extent, lb, extent, buf_size;
    char *recv_buf = NULL, *recv_buf_free = NULL;
    char *result_buf = NULL, *result_buf_free = NULL;
   
    /* Initialize */
    rank = smpi_comm_rank(comm);
    size = smpi_comm_size(comm);
   
    XBT_DEBUG("coll:tuned:reduce_scatter_ompi_basic_recursivehalving, rank %d", rank);

    /* Find displacements and the like */
    disps = (int*) xbt_malloc(sizeof(int) * size);
    if (NULL == disps) return MPI_ERR_OTHER;

    disps[0] = 0;
    for (i = 0; i < (size - 1); ++i) {
        disps[i + 1] = disps[i] + rcounts[i];
    }
    count = disps[size - 1] + rcounts[size - 1];

    /* short cut the trivial case */
    if (0 == count) {
        xbt_free(disps);
        return MPI_SUCCESS;
    }

    /* get datatype information */
    smpi_datatype_extent(dtype, &lb, &extent);
    smpi_datatype_extent(dtype, &true_lb, &true_extent);
    buf_size = true_extent + (ptrdiff_t)(count - 1) * extent;

    /* Handle MPI_IN_PLACE */
    if (MPI_IN_PLACE == sbuf) {
        sbuf = rbuf;
    }

    /* Allocate temporary receive buffer. */
    recv_buf_free = (char*) xbt_malloc(buf_size);
    recv_buf = recv_buf_free - lb;
    if (NULL == recv_buf_free) {
        err = MPI_ERR_OTHER;
        goto cleanup;
    }
   
    /* allocate temporary buffer for results */
    result_buf_free = (char*) xbt_malloc(buf_size);
    result_buf = result_buf_free - lb;
   
    /* copy local buffer into the temporary results */
    err =smpi_datatype_copy(sbuf, count, dtype, result_buf, count, dtype);
    if (MPI_SUCCESS != err) goto cleanup;
   
    /* figure out power of two mapping: grow until larger than
       comm size, then go back one, to get the largest power of
       two less than or equal to comm size */
    while (tmp_size <= size) tmp_size <<= 1;
    tmp_size >>= 1;
    remain = size - tmp_size;
   
    /* If comm size is not a power of two, have the first "remain"
       procs with an even rank send to rank + 1, leaving a power of
       two procs to do the rest of the algorithm */
    if (rank < 2 * remain) {
        if ((rank & 1) == 0) {
            smpi_mpi_send(result_buf, count, dtype, rank + 1, 
                                    COLL_TAG_REDUCE_SCATTER,
                                    comm);
            /* we don't participate from here on out */
            tmp_rank = -1;
        } else {
            smpi_mpi_recv(recv_buf, count, dtype, rank - 1,
                                    COLL_TAG_REDUCE_SCATTER,
                                    comm, MPI_STATUS_IGNORE);
         
            /* integrate their results into our temp results */
            smpi_op_apply(op, recv_buf, result_buf, &count, &dtype);
         
            /* adjust rank to be the bottom "remain" ranks */
            tmp_rank = rank / 2;
        }
    } else {
        /* just need to adjust rank to show that the bottom "even
           remain" ranks dropped out */
        tmp_rank = rank - remain;
    }
   
    /* For ranks not kicked out by the above code, perform the
       recursive halving */
    if (tmp_rank >= 0) {
        int *tmp_disps = NULL, *tmp_rcounts = NULL;
        int mask, send_index, recv_index, last_index;
      
        /* recalculate disps and rcounts to account for the
           special "remainder" processes that are no longer doing
           anything */
        tmp_rcounts = (int*) xbt_malloc(tmp_size * sizeof(int));
        if (NULL == tmp_rcounts) {
            err = MPI_ERR_OTHER;
            goto cleanup;
        }
        tmp_disps = (int*) xbt_malloc(tmp_size * sizeof(int));
        if (NULL == tmp_disps) {
            xbt_free(tmp_rcounts);
            err = MPI_ERR_OTHER;
            goto cleanup;
        }

        for (i = 0 ; i < tmp_size ; ++i) {
            if (i < remain) {
                /* need to include old neighbor as well */
                tmp_rcounts[i] = rcounts[i * 2 + 1] + rcounts[i * 2];
            } else {
                tmp_rcounts[i] = rcounts[i + remain];
            }
        }

        tmp_disps[0] = 0;
        for (i = 0; i < tmp_size - 1; ++i) {
            tmp_disps[i + 1] = tmp_disps[i] + tmp_rcounts[i];
        }

        /* do the recursive halving communication.  Don't use the
           dimension information on the communicator because I
           think the information is invalidated by our "shrinking"
           of the communicator */
        mask = tmp_size >> 1;
        send_index = recv_index = 0;
        last_index = tmp_size;
        while (mask > 0) {
            int tmp_peer, peer, send_count, recv_count;
            MPI_Request request;

            tmp_peer = tmp_rank ^ mask;
            peer = (tmp_peer < remain) ? tmp_peer * 2 + 1 : tmp_peer + remain;

            /* figure out if we're sending, receiving, or both */
            send_count = recv_count = 0;
            if (tmp_rank < tmp_peer) {
                send_index = recv_index + mask;
                for (i = send_index ; i < last_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < send_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            } else {
                recv_index = send_index + mask;
                for (i = send_index ; i < recv_index ; ++i) {
                    send_count += tmp_rcounts[i];
                }
                for (i = recv_index ; i < last_index ; ++i) {
                    recv_count += tmp_rcounts[i];
                }
            }

            /* actual data transfer.  Send from result_buf,
               receive into recv_buf */
            if (send_count > 0 && recv_count != 0) {
                request=smpi_mpi_irecv(recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                                         recv_count, dtype, peer,
                                         COLL_TAG_REDUCE_SCATTER,
                                         comm);
                if (MPI_SUCCESS != err) {
                    xbt_free(tmp_rcounts);
                    xbt_free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (recv_count > 0 && send_count != 0) {
                smpi_mpi_send(result_buf + (ptrdiff_t)tmp_disps[send_index] * extent,
                                        send_count, dtype, peer, 
                                        COLL_TAG_REDUCE_SCATTER,
                                        comm);
                if (MPI_SUCCESS != err) {
                    xbt_free(tmp_rcounts);
                    xbt_free(tmp_disps);
                    goto cleanup;
                }                                             
            }
            if (send_count > 0 && recv_count != 0) {
                smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
            }

            /* if we received something on this step, push it into
               the results buffer */
            if (recv_count > 0) {
                smpi_op_apply(op, 
                               recv_buf + (ptrdiff_t)tmp_disps[recv_index] * extent, 
                               result_buf + (ptrdiff_t)tmp_disps[recv_index] * extent,
                               &recv_count, &dtype);
            }

            /* update for next iteration */
            send_index = recv_index;
            last_index = recv_index + mask;
            mask >>= 1;
        }

        /* copy local results from results buffer into real receive buffer */
        if (0 != rcounts[rank]) {
            err = smpi_datatype_copy(result_buf + disps[rank] * extent,
                                       rcounts[rank], dtype, 
                                       rbuf, rcounts[rank], dtype);
            if (MPI_SUCCESS != err) {
                xbt_free(tmp_rcounts);
                xbt_free(tmp_disps);
                goto cleanup;
            }                                             
        }

        xbt_free(tmp_rcounts);
        xbt_free(tmp_disps);
    }
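/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   when the communicator size is not a power of two, the first "remain"
   even-ranked processes are folded onto their odd neighbors, so a rank in the
   compacted power-of-two group maps back to a real rank with the same formula
   as the "peer" computation above. */
static int tmp_to_real_rank_sketch(int tmp_rank, int remain)
{
  /* the first "remain" compacted ranks are the odd members of the folded
     pairs; the remaining ones are simply shifted up by "remain" */
  return (tmp_rank < remain) ? tmp_rank * 2 + 1 : tmp_rank + remain;
}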
Exemple #17
0
/**
 * This is a generic implementation of the reduce protocol. It uses the tree
 * provided as an argument and executes all operations on segments of
 * count elements of the given datatype.
 * For the last communication it updates the count in order to limit
 * the number of elements to the original count (original_count).
 *
 * Note that for non-commutative operations we cannot save the memory copy
 * for the first block: thus we must copy sendbuf to accumbuf on intermediate
 * nodes to keep the optimized loop happy.
 */
int smpi_coll_tuned_ompi_reduce_generic( void* sendbuf, void* recvbuf, int original_count,
                                    MPI_Datatype datatype, MPI_Op  op,
                                    int root, MPI_Comm comm,
                                    ompi_coll_tree_t* tree, int count_by_segment,
                                    int max_outstanding_reqs )
{
    char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL};
    char *accumbuf = NULL, *accumbuf_free = NULL;
    char *local_op_buffer = NULL, *sendtmpbuf = NULL;
    ptrdiff_t extent, lower_bound, segment_increment;
    MPI_Request  reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
    int num_segments, line, ret, segindex, i, rank;
    int recvcount, prevcount, inbi;

    /**
     * Determine number of segments and number of elements
     * sent per operation
     */
    smpi_datatype_extent( datatype, &lower_bound, &extent);
    num_segments = (original_count + count_by_segment - 1) / count_by_segment;
    segment_increment = count_by_segment * extent;

    sendtmpbuf = (char*) sendbuf; 
    if( sendbuf == MPI_IN_PLACE ) { 
        sendtmpbuf = (char *)recvbuf; 
    }

    XBT_DEBUG( "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)(num_segments * segment_increment), (unsigned long)segment_increment, max_outstanding_reqs);

    rank = smpi_comm_rank(comm);

    /* non-leaf nodes - wait for children to send me data & forward up 
       (if needed) */
    if( tree->tree_nextsize > 0 ) {
        ptrdiff_t true_extent, real_segment_size;
        true_extent=smpi_datatype_get_extent( datatype);

        /* handle a non-existent recv buffer (i.e. it is NULL) and
           protect the recv buffer on non-root nodes */
        accumbuf = (char*)recvbuf;
        if( (NULL == accumbuf) || (root != rank) ) {
            /* Allocate temporary accumulator buffer. */
            accumbuf_free = (char*)malloc(true_extent + 
                                          (original_count - 1) * extent);
            if (accumbuf_free == NULL) { 
                line = __LINE__; ret = -1; goto error_hndl; 
            }
            accumbuf = accumbuf_free - lower_bound;
        } 

        /* If this is a non-commutative operation we must copy
           sendbuf to the accumbuf, in order to simplify the loops */
        if (!smpi_op_is_commute(op)) {
            smpi_datatype_copy(
                                                (char*)sendtmpbuf, original_count, datatype,
                                                (char*)accumbuf, original_count, datatype);
        }
        /* Allocate two buffers for incoming segments */
        real_segment_size = true_extent + (count_by_segment - 1) * extent;
        inbuf_free[0] = (char*) malloc(real_segment_size);
        if( inbuf_free[0] == NULL ) { 
            line = __LINE__; ret = -1; goto error_hndl; 
        }
        inbuf[0] = inbuf_free[0] - lower_bound;
        /* if there is a chance to overlap communication -
           allocate a second buffer */
        if( (num_segments > 1) || (tree->tree_nextsize > 1) ) {
            inbuf_free[1] = (char*) malloc(real_segment_size);
            if( inbuf_free[1] == NULL ) { 
                line = __LINE__; ret = -1; goto error_hndl;
            }
            inbuf[1] = inbuf_free[1] - lower_bound;
        } 

        /* reset input buffer index and receive count */
        inbi = 0;
        recvcount = 0;
        /* for each segment */
        for( segindex = 0; segindex <= num_segments; segindex++ ) {
            prevcount = recvcount;
            /* recvcount - number of elements in current segment */
            recvcount = count_by_segment;
            if( segindex == (num_segments-1) )
                recvcount = original_count - count_by_segment * segindex;

            /* for each child */
            for( i = 0; i < tree->tree_nextsize; i++ ) {
                /**
                 * We try to overlap communication:
                 * either with next segment or with the next child
                 */
                /* post irecv for current segindex on current child */
                if( segindex < num_segments ) {
                    void* local_recvbuf = inbuf[inbi];
                    if( 0 == i ) {
                        /* for the first step (1st child per segment) and 
                         * commutative operations we might be able to irecv 
                         * directly into the accumulate buffer so that we can 
                         * reduce(op) this with our sendbuf in one step as 
                         * ompi_op_reduce only has two buffer pointers, 
                         * this avoids an extra memory copy.
                         *
                         * BUT if the operation is non-commutative or 
                         * we are root and are USING MPI_IN_PLACE this is wrong!
                         */
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_recvbuf = accumbuf + segindex * segment_increment;
                        }
                    }

                    reqs[inbi]=smpi_mpi_irecv(local_recvbuf, recvcount, datatype,
                                             tree->tree_next[i], 
                                             COLL_TAG_REDUCE, comm
                                             );
                }
                /* wait for previous req to complete, if any.
                   if there are no requests reqs[inbi ^1] will be 
                   MPI_REQUEST_NULL. */
                /* wait on data from last child for previous segment */
                smpi_mpi_waitall( 1, &reqs[inbi ^ 1], 
                                             MPI_STATUSES_IGNORE );
                local_op_buffer = inbuf[inbi ^ 1];
                if( i > 0 ) {
                    /* our first operation is to combine our own [sendbuf] data
                     * with the data we received from downstream (but only if
                     * the operation is commutative and if we are not root and
                     * not using MPI_IN_PLACE)
                     */
                    if( 1 == i ) {
                        if( (smpi_op_is_commute(op)) && 
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + segindex * segment_increment;
                        }
                    }
                    /* apply operation */
                    smpi_op_apply(op, local_op_buffer, 
                                   accumbuf + segindex * segment_increment, 
                                   &recvcount, &datatype );
                } else if ( segindex > 0 ) {
                    void* accumulator = accumbuf + (segindex-1) * segment_increment;
                    if( tree->tree_nextsize <= 1 ) {
                        if( (smpi_op_is_commute(op)) &&
                            !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) {
                            local_op_buffer = sendtmpbuf + (segindex-1) * segment_increment;
                        }
                    }
                    smpi_op_apply(op, local_op_buffer, accumulator, &prevcount, 
                                   &datatype );

                    /* all reductions on the data available at this step (i) are
                     * complete; pass the result to the next process unless you are the root.
                     */
                    if (rank != tree->tree_root) {
                        /* send combined/accumulated data to parent */
                        smpi_mpi_send( accumulator, prevcount, 
                                                  datatype, tree->tree_prev, 
                                                  COLL_TAG_REDUCE,
                                                  comm);
                    }

                    /* we stop when segindex == number of segments
                       (i.e. we do num_segments + 1 steps for pipelining) */
                    if (segindex == num_segments) break;
                }

                /* update input buffer index */
                inbi = inbi ^ 1;
            } /* end of for each child */
        } /* end of for each segment */

        /* clean up */
        if( inbuf_free[0] != NULL) free(inbuf_free[0]);
        if( inbuf_free[1] != NULL) free(inbuf_free[1]);
        if( accumbuf_free != NULL ) free(accumbuf_free);
    }

    /* leaf nodes 
       Depending on the value of max_outstanding_reqs and 
       the number of segments we have two options:
       - send all segments using blocking send to the parent, or
       - avoid flooding the parent nodes by limiting the number of
       outstanding requests to max_outstanding_reqs.
       TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size 
       for the current communication, synchronization should be used only 
       when the message/segment size is smaller than the eager size.
    */
    else {

        /* If the number of segments is less than the maximum number of outstanding
           requests, or there is no limit on the maximum number of outstanding
           requests, we send data to the parent using blocking sends */
        if ((0 == max_outstanding_reqs) || 
            (num_segments <= max_outstanding_reqs)) {
            
            segindex = 0;
            while ( original_count > 0) {
                if (original_count < count_by_segment) {
                    count_by_segment = original_count;
                }
                smpi_mpi_send((char*)sendbuf + 
                                         segindex * segment_increment,
                                         count_by_segment, datatype,
                                         tree->tree_prev, 
                                         COLL_TAG_REDUCE,
                                         comm) ;
                segindex++;
                original_count -= count_by_segment;
            }
        }

        /* Otherwise, introduce flow control:
           - post max_outstanding_reqs non-blocking sends,
           - for the remaining segments:
             wait for a send to complete, and post the next one,
           - wait for all outstanding sends to complete.
        */
        else {

            int creq = 0;
            MPI_Request* sreq = NULL;

            sreq = (MPI_Request*) calloc( max_outstanding_reqs,
                                              sizeof(MPI_Request ) );
            if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; }

            /* post first group of requests */
            for (segindex = 0; segindex < max_outstanding_reqs; segindex++) {
                sreq[segindex]=smpi_mpi_isend((char*)sendbuf +
                                          segindex * segment_increment,
                                          count_by_segment, datatype,
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm);
                original_count -= count_by_segment;
            }

            creq = 0;
            while ( original_count > 0 ) {
                /* wait on a posted request to complete */
                smpi_mpi_wait(&sreq[creq], MPI_STATUS_IGNORE);
                sreq[creq] = MPI_REQUEST_NULL;

                if( original_count < count_by_segment ) {
                    count_by_segment = original_count;
                }
                sreq[creq]=smpi_mpi_isend((char*)sendbuf + 
                                          segindex * segment_increment, 
                                          count_by_segment, datatype, 
                                          tree->tree_prev, 
                                          COLL_TAG_REDUCE,
                                          comm );
                creq = (creq + 1) % max_outstanding_reqs;
                segindex++;
                original_count -= count_by_segment;
            }

            /* Wait on the remaining request to complete */
            smpi_mpi_waitall( max_outstanding_reqs, sreq, 
                                         MPI_STATUSES_IGNORE );

            /* free requests */
            free(sreq);
        }
    }
    return MPI_SUCCESS;

 error_hndl:  /* error handler */
    XBT_DEBUG("ERROR_HNDL: node %d file %s line %d error %d\n", 
                   rank, __FILE__, line, ret );
    if( inbuf_free[0] != NULL ) free(inbuf_free[0]);
    if( inbuf_free[1] != NULL ) free(inbuf_free[1]);
    if( accumbuf_free != NULL ) free(accumbuf_free);
    return ret;
}
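/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   the segmentation bookkeeping used throughout the function above reduces to
   the arithmetic below; the real code inlines it. */
static void segmentation_sketch(int original_count, int count_by_segment,
                                int *num_segments, int *last_segment_count)
{
  /* number of segments, rounding up */
  *num_segments = (original_count + count_by_segment - 1) / count_by_segment;
  /* the last segment carries whatever is left over */
  *last_segment_count =
      original_count - count_by_segment * (*num_segments - 1);
}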
/* Non-topology-specific pipelined linear-bcast function */
int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count,
                                                     MPI_Datatype datatype,
                                                     int root, MPI_Comm comm)
{
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;


  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, j, k;
  int tag = -COLL_TAG_BCAST;
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* source and destination */
  int to, from;



  rank = smpi_comm_rank(MPI_COMM_WORLD);
  size = smpi_comm_size(MPI_COMM_WORLD);


  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by the segment size =>
     the small remainder will be handled by the native implementation */
  int remainder = count % segment;

  /* if root is not zero, send to rank zero first;
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }


  /* value == 0 means root has not sent data (or header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    segment = count;
    pipe_length = 1;
  }

  /* start pipeline bcast */

  send_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  recv_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  send_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
  recv_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

  /* root */
  if (rank == 0) {
    sent_count = 0;
    int iteration = 0;

    for (i = 0; i < BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; i++)
      will_send[i] = 0;
    while (sent_count < (size - 1)) {
      iteration++;

      /* loop k times to let more processes arrive before starting to send data */
      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                       &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, MPI_COMM_WORLD,
                       &status);
              i = 0;
            }
          }
        }
      }

      header_index = 0;

      /* recv 1-byte message */
      for (i = 1; i < size; i++) {
        /* message arrived */
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /* send header followed by data */
      if (header_index != 0) {
        header_buf[header_index] = -1;
        to = header_buf[0];

        /* send header */
        smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);

        /* send data - pipeline */
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }


      /* end - send header followed by data */
      /* randomly MPI_Send to one node */
      /* this part has been disabled for performance reasons */
      else if (2 == 3) {
        /* search for the first node that never received data before */
        for (i = 0; i < size; i++) {
          if (i == root)
            continue;
          if (already_sent[i] == 0) {
            header_buf[0] = i;
            header_buf[1] = -1;
            to = i;

            smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);

            /* still need to chop data so that we can use the same non-root code */
            for (j = 0; j < pipe_length; j++) {
              smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag, comm);
            }
          }
        }
      }
    }                           /* end - while (send_count < size-1) loop */
  }

  /* end - root */
  /* non-root */
  else {

    /* send 1-byte message to root */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for the header; forward it when required */
    request = smpi_mpi_irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE, tag, comm);
    smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

    /* search for where it is */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    to = header_buf[myordering + 1];
    if (myordering == 0) {
      from = 0;
    } else {
      from = header_buf[myordering - 1];
    }

    /* send header when required */
    if (to != -1) {
      smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);
    }

    /* receive data */

    for (i = 0; i < pipe_length; i++) {
      recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, from, tag, comm);
    }

    /* forward data */
    if (to != -1) {
      for (i = 0; i < pipe_length; i++) {
        smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
        send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm);
      }
      smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
    }

    /* recv only */
    else {
      smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array);
    }
  }

  free(send_request_array);
  free(recv_request_array);
  free(send_status_array);
  free(recv_status_array);
  /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware_wait use default MPI_bcast.");	  	  
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm);
  }

  return MPI_SUCCESS;
}
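/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   the pipelining parameters computed at the top of this function. The
   zero-segment guard is an extra precaution the original does not take here,
   and the small-message branch simply reports a zero remainder, which matches
   the effective behaviour since the fallback bcast is skipped in that case. */
static void pipeline_params_sketch(int count, int segment_size_in_byte,
                                   int extent, int *segment,
                                   int *pipe_length, int *remainder)
{
  *segment = segment_size_in_byte / extent;   /* segment size in elements */
  if (*segment == 0)
    *segment = 1;                             /* avoid a zero-length segment */
  if (count <= *segment) {                    /* message too small: no pipeline */
    *segment = count;
    *pipe_length = 1;
    *remainder = 0;
    return;
  }
  *pipe_length = count / *segment;            /* number of pipelined segments */
  *remainder = count % *segment;              /* handled by the fallback bcast */
}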
Exemple #19
0
int smpi_coll_tuned_allgather_SMP_NTS(void *sbuf, int scount,
                                      MPI_Datatype stype, void *rbuf,
                                      int rcount, MPI_Datatype rtype,
                                      MPI_Comm comm)
{
  int src, dst, comm_size, rank;
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);
  MPI_Aint rextent, sextent;
  rextent = smpi_datatype_get_extent(rtype);
  sextent = smpi_datatype_get_extent(stype);
  int tag = COLL_TAG_ALLGATHER;
  MPI_Request request;
  MPI_Request rrequest_array[128];

  MPI_Status status;
  int i, send_offset, recv_offset;
  int intra_rank, inter_rank;
  intra_rank = rank % NUM_CORE;
  inter_rank = rank / NUM_CORE;
  int inter_comm_size = (comm_size + NUM_CORE - 1) / NUM_CORE;
  int num_core_in_current_smp = NUM_CORE;

  /* for too small a number of processes, use the default implementation */
  if (comm_size <= NUM_CORE) {
    XBT_WARN("MPI_allgather_SMP_NTS use default MPI_allgather.");
    smpi_mpi_allgather(sbuf, scount, stype, rbuf, rcount, rtype, comm);
    return MPI_SUCCESS;    
  }

  // the last SMP node may have fewer running processes than the others
  if (inter_rank == (inter_comm_size - 1)) {
    num_core_in_current_smp = comm_size - (inter_rank * NUM_CORE);
  }
  //copy corresponding message from sbuf to rbuf
  recv_offset = rank * rextent * rcount;
  smpi_mpi_sendrecv(sbuf, scount, stype, rank, tag,
               ((char *) rbuf + recv_offset), rcount, rtype, rank, tag, comm,
               &status);

  //gather to root of each SMP

  for (i = 1; i < num_core_in_current_smp; i++) {

    dst =
        (inter_rank * NUM_CORE) + (intra_rank + i) % (num_core_in_current_smp);
    src =
        (inter_rank * NUM_CORE) + (intra_rank - i +
                                   num_core_in_current_smp) %
        (num_core_in_current_smp);
    recv_offset = src * rextent * rcount;

    smpi_mpi_sendrecv(sbuf, scount, stype, dst, tag,
                 ((char *) rbuf + recv_offset), rcount, rtype, src, tag, comm,
                 &status);

  }

  // INTER-SMP-ALLGATHER 
  // Every root of each SMP node posts the INTER-SMP Sendrecv, then does an INTRA-SMP Bcast for each received message
  // Use logical ring algorithm

  // root of each SMP
  if (intra_rank == 0) {
    src = ((inter_rank - 1 + inter_comm_size) % inter_comm_size) * NUM_CORE;
    dst = ((inter_rank + 1) % inter_comm_size) * NUM_CORE;

    // post all inter Irecv
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      rrequest_array[i] = smpi_mpi_irecv((char *)rbuf+recv_offset, rcount * NUM_CORE, rtype, src, tag+i, comm);
    }

    // send first message
    send_offset =
        ((inter_rank +
          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
    smpi_mpi_isend((char *) rbuf + send_offset, scount * NUM_CORE, stype,
                   dst, tag, comm);

    // loop : recv-inter , send-inter, send-intra (linear-bcast)
    for (i = 0; i < inter_comm_size - 2; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      smpi_mpi_wait(&rrequest_array[i], &status);
      smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                     dst, tag + i + 1, comm);
      if (num_core_in_current_smp > 1) {
        request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                  (rank + 1), tag + i + 1, comm);
      }
    }

    // recv last message and send_intra
    recv_offset =
        ((inter_rank - i - 1 +
          inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
    //recv_offset = ((inter_rank + 1) % inter_comm_size) * NUM_CORE * sextent * scount;
    //i=inter_comm_size-2;
    smpi_mpi_wait(&rrequest_array[i], &status);
    if (num_core_in_current_smp > 1) {
      request = smpi_mpi_isend((char *) rbuf + recv_offset, scount * NUM_CORE, stype,
                (rank + 1), tag + i + 1, comm);
    }
  }
  // last rank of each SMP
  else if (intra_rank == (num_core_in_current_smp - 1)) {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
                rank - 1, tag + i + 1, comm);
      smpi_mpi_wait(&request, &status);
    }
  }
  // intermediate rank of each SMP
  else {
    for (i = 0; i < inter_comm_size - 1; i++) {
      recv_offset =
          ((inter_rank - i - 1 +
            inter_comm_size) % inter_comm_size) * NUM_CORE * sextent * scount;
      request = smpi_mpi_irecv((char *) rbuf + recv_offset, (rcount * NUM_CORE), rtype,
                rank - 1, tag + i + 1, comm);
      smpi_mpi_wait(&request, &status);
      request = smpi_mpi_isend((char *) rbuf + recv_offset, (scount * NUM_CORE), stype,
                (rank + 1), tag + i + 1, comm);
    }
  }

  return MPI_SUCCESS;
}
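/* A minimal illustrative sketch (hypothetical helper, not an SMPI function):
   the SMP decomposition used above maps a global rank onto a (node, core)
   pair, with the last node possibly only partially populated. num_core plays
   the role of the NUM_CORE constant assumed by the algorithm. */
static void smp_decompose_sketch(int rank, int comm_size, int num_core,
                                 int *intra_rank, int *inter_rank,
                                 int *inter_comm_size, int *cores_in_my_smp)
{
  *intra_rank = rank % num_core;                  /* position inside the node */
  *inter_rank = rank / num_core;                  /* index of the node */
  *inter_comm_size = (comm_size + num_core - 1) / num_core;
  *cores_in_my_smp = (*inter_rank == *inter_comm_size - 1)
                         ? comm_size - (*inter_rank * num_core)  /* last node */
                         : num_core;
}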