Example #1
0
/*
 * Replay handler for an "allReduce" trace line.
 *
 * action[2] is parsed as the message size (sent as MPI_BYTE elements) and
 * action[3] as the amount of computation handed to smpi_execute_flops().
 * The operation is replayed on MPI_COMM_WORLD as a reduce towards rank 0,
 * the computation, then a broadcast from rank 0. When verbose logging is
 * enabled for the smpi_replay category, the action line and the simulated
 * time it consumed are logged.
 */
static void action_allReduce(const char *const *action) {
  double msg_bytes = parse_double(action[2]);
  double flops = parse_double(action[3]);
  double started_at = smpi_process_simulated_elapsed();
#ifdef HAVE_TRACING
  int my_rank = smpi_comm_rank(MPI_COMM_WORLD);
  TRACE_smpi_computing_out(my_rank);
  TRACE_smpi_collective_in(my_rank, -1, __FUNCTION__);
#endif
  /* reduce -> compute -> broadcast, everything rooted at rank 0 */
  smpi_mpi_reduce(NULL, NULL, msg_bytes, MPI_BYTE, MPI_OP_NULL, 0, MPI_COMM_WORLD);
  smpi_execute_flops(flops);
  smpi_mpi_bcast(NULL, msg_bytes, MPI_BYTE, 0, MPI_COMM_WORLD);
#ifdef HAVE_TRACING
  TRACE_smpi_collective_out(my_rank, -1, __FUNCTION__);
  TRACE_smpi_computing_in(my_rank);
#endif

  /* Building the joined action string is only worth it when it gets logged. */
  if (!XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose))
    return;

  char *line = xbt_str_join_array(action, " ");
  XBT_VERB("%s %f", line, smpi_process_simulated_elapsed() - started_at);
  free(line);
}
Example #2
0
/*
 * Replay handler for a "bcast" trace line.
 *
 * action[2] is parsed as the message size (sent as MPI_BYTE elements); the
 * broadcast is replayed on MPI_COMM_WORLD with rank 0 as root. When verbose
 * logging is enabled for the smpi_replay category, the action line and the
 * simulated time it consumed are logged.
 */
static void action_bcast(const char *const *action)
{
  double msg_bytes = parse_double(action[2]);
  double started_at = smpi_process_simulated_elapsed();
#ifdef HAVE_TRACING
  int my_rank = smpi_comm_rank(MPI_COMM_WORLD);
  TRACE_smpi_computing_out(my_rank);
  /* the tracing layer is given the root as a rank of the world group */
  int traced_root = smpi_group_rank(smpi_comm_group(MPI_COMM_WORLD), 0);
  TRACE_smpi_collective_in(my_rank, traced_root, __FUNCTION__);
#endif

  smpi_mpi_bcast(NULL, msg_bytes, MPI_BYTE, 0, MPI_COMM_WORLD);
#ifdef HAVE_TRACING
  TRACE_smpi_collective_out(my_rank, traced_root, __FUNCTION__);
  TRACE_smpi_computing_in(my_rank);
#endif

  /* Building the joined action string is only worth it when it gets logged. */
  if (!XBT_LOG_ISENABLED(smpi_replay, xbt_log_priority_verbose))
    return;

  char *line = xbt_str_join_array(action, " ");
  XBT_VERB("%s %f", line, smpi_process_simulated_elapsed() - started_at);
  free(line);
}
/*
 * Arrival-pattern-aware pipelined broadcast ("wait" variant).
 *
 * If root is not rank 0, the whole message is first relayed to rank 0, which
 * then drives the broadcast. Every other rank sends a 1-byte "arrived"
 * notification to rank 0. Rank 0 repeatedly probes for those notifications,
 * builds a header listing the ranks that arrived during the current round
 * (terminated by -1), and sends the header followed by the segmented data to
 * the first rank of the list. Each listed rank forwards the header and the
 * data segments to its successor in the list, forming a chain.
 *
 * buf, count, datatype: the message; root: broadcasting rank; comm: the
 * communicator the broadcast runs on. Returns MPI_SUCCESS.
 *
 * All point-to-point traffic, probing and rank/size queries use comm (the
 * previous code mixed comm with MPI_COMM_WORLD, which only worked when both
 * were the same communicator).
 *
 * NOTE(review): the fixed-size arrays assume the communicator has at most
 * BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE ranks and that one round's rank list
 * (plus terminator) fits in BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE ints;
 * nothing in this function enforces that.
 */
int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count,
                                                     MPI_Datatype datatype,
                                                     int root, MPI_Comm comm)
{
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, k;
  int tag = -COLL_TAG_BCAST;
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* source and destination */
  int to, from;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent;
  /* an element larger than the configured segment would give 0 and make the
     divisions below trap: pipeline one element at a time instead */
  if (segment == 0)
    segment = 1;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* value == 0 means root has not sent data (or header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    segment = count;
    pipe_length = 1;
  }

  /* start pipeline bcast */

  send_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  recv_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  send_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
  recv_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

  /* root */
  if (rank == 0) {
    sent_count = 0;

    for (i = 0; i < max_node; i++)
      will_send[i] = 0;

    while (sent_count < (size - 1)) {

      /* scan 3 times to let more processes arrive before start sending data */
      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                       &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              /* consume the 1-byte arrival notification */
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
                       &status);
              /* restart the scan from rank 1 (the loop increment follows) */
              i = 0;
            }
          }
        }
      }

      header_index = 0;

      /* collect the ranks that arrived during this round */
      for (i = 1; i < size; i++) {
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /* send header followed by data to the head of the chain; when nobody
         arrived in this round, simply probe again */
      if (header_index != 0) {
        header_buf[header_index] = -1;
        to = header_buf[0];

        /* send header */
        smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);

        /* send data - pipeline */
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      }
    }                           /* end - while (sent_count < size-1) loop */
  }

  /* end - root */
  /* non-root */
  else {

    /* send 1-byte message to root */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for the header, coming either from rank 0 or from the predecessor */
    request = smpi_mpi_irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE, tag, comm);
    smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

    /* search for our own position in the chain */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    /* successor is the next entry (-1 when we are last); predecessor is
       rank 0 for the head of the chain */
    to = header_buf[myordering + 1];
    if (myordering == 0) {
      from = 0;
    } else {
      from = header_buf[myordering - 1];
    }

    /* send header when required */
    if (to != -1) {
      smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);
    }

    /* receive data */
    for (i = 0; i < pipe_length; i++) {
      recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, from, tag, comm);
    }

    /* forward data: each segment is passed on as soon as it has arrived */
    if (to != -1) {
      for (i = 0; i < pipe_length; i++) {
        smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
        send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype, to, tag, comm);
      }
      smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
    }

    /* recv only */
    else {
      smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array);
    }
  }

  free(send_request_array);
  free(recv_request_array);
  free(send_status_array);
  free(recv_status_array);
  /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware_wait use default MPI_bcast.");
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm);
  }

  return MPI_SUCCESS;
}
/*
 * Arrival-pattern-aware broadcast implemented as scatter + ring allgather.
 *
 * Messages with fewer elements than ranks fall back to smpi_mpi_bcast().
 * Otherwise, if root is not rank 0 the message is first relayed to rank 0.
 * Every other rank sends a 1-byte "arrived" notification to rank 0. Rank 0
 * probes for notifications and, for each round of arrived ranks, sends all
 * of them a header listing the round's ranks (terminated by -1) and then
 * scatters the message among them: chunk i of count / n elements goes to the
 * i-th listed rank, the last chunk also carrying the count % n remainder.
 * The ranks of a round then run a ring allgather among themselves so that
 * each ends up with the whole message.
 *
 * buf, count, datatype: the message; root: broadcasting rank; comm: the
 * communicator. Returns MPI_SUCCESS.
 *
 * NOTE(review): the fixed-size arrays assume the communicator has at most
 * BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE ranks and that one round's rank list
 * (plus terminator) fits in BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE ints;
 * nothing in this function enforces that.
 */
int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;//in order to use ANY_TAG, make this one positive
  int header_tag = 10;
  MPI_Status status;

  int curr_remainder;
  int curr_size;
  int curr_increment;
  int send_offset;
  int recv_offset;
  int send_count;
  int recv_count;

  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, k;

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* source and destination */
  int to, from;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* message too small to give every rank at least one element */
  if (count < size) {
    XBT_WARN("MPI_bcast_arrival_scatter use default MPI_bcast.");
    smpi_mpi_bcast(buf, count, datatype, root, comm);
    return MPI_SUCCESS;
  }

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag - 1, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag - 1, comm, &status);
    }
  }

  /* value == 0 means root has not sent data (or header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* start bcast */

  /* root */
  if (rank == 0) {

    for (i = 0; i < max_node; i++)
      will_send[i] = 0;

    sent_count = 0;
    while (sent_count < (size - 1)) {

      /* scan 3 times to let more processes arrive before sending data */
      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                       &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              /* consume the 1-byte arrival notification */
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
                       &status);
              /* restart the scan from rank 1 (the loop increment follows) */
              i = 0;
            }
          }
        }
      }
      header_index = 0;

      /* collect the ranks that arrived during this round */
      for (i = 1; i < size; i++) {
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /* send header followed by data */
      if (header_index != 0) {
        header_buf[header_index] = -1;

        /* every rank of the round gets the full list */
        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          smpi_mpi_send(header_buf, header_size, MPI_INT, to, header_tag, comm);
        }

        curr_remainder = count % header_index;
        curr_size = (count / header_index);
        curr_increment = curr_size * extent;

        /* scatter: the last chunk also carries the remainder */
        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          if ((i == (header_index - 1)) || (curr_size == 0))
            curr_size += curr_remainder;
          smpi_mpi_send((char *) buf + (i * curr_increment), curr_size, datatype, to,
                   tag, comm);
        }
      }
    }                           /* while (sent_count < size-1) */
  }

  /* rank 0 */
  /* non-root */
  else {
    /* send 1-byte message to root */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for the header of the round we belong to */
    smpi_mpi_recv(header_buf, header_size, MPI_INT, 0, header_tag, comm, &status);

    /* search for our own position in the list */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    /* number of ranks in this round */
    int total_nodes = 0;
    while (header_buf[total_nodes] != -1) {
      total_nodes++;
    }

    curr_remainder = count % total_nodes;
    curr_size = (count / total_nodes);
    curr_increment = curr_size * extent;
    int recv_size = curr_size;

    /* receive our own chunk from rank 0 */
    if (myordering == (total_nodes - 1))
      recv_size += curr_remainder;
    smpi_mpi_recv((char *) buf + (myordering * curr_increment), recv_size, datatype,
             0, tag, comm, &status);

    /* at this point all nodes in this set perform an all-gather on a ring;
       neighbours wrap around at both ends (selected with if/else so that
       header_buf is never indexed with -1 when myordering == 0) */
    if (myordering == 0)
      from = header_buf[total_nodes - 1];
    else
      from = header_buf[myordering - 1];
    if (myordering == (total_nodes - 1))
      to = header_buf[0];
    else
      to = header_buf[myordering + 1];

    /* last segment may have a larger size since it also includes the remainder */
    int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent;

    /* allgather: at step i forward the chunk obtained at step i-1 (our own
       chunk at step 0) and receive the one preceding it on the ring */
    for (i = 0; i < total_nodes - 1; i++) {
      send_offset =
          ((myordering - i + total_nodes) % total_nodes) * curr_increment;
      recv_offset =
          ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment;

      /* adjust size */
      if (send_offset != last_segment_ptr)
        send_count = curr_size;
      else
        send_count = curr_size + curr_remainder;

      if (recv_offset != last_segment_ptr)
        recv_count = curr_size;
      else
        recv_count = curr_size + curr_remainder;

      smpi_mpi_sendrecv((char *) buf + send_offset, send_count, datatype, to,
                   tag + i, (char *) buf + recv_offset, recv_count, datatype,
                   from, tag + i, comm, &status);
    }
  }                             /* non-root */

  return MPI_SUCCESS;
}
Example #5
0
/*
 * Pipelined broadcast over a binary tree of ranks.
 *
 * The tree is rooted at rank 0: rank r receives from (r - 1) / 2 and forwards
 * to 2r + 1 and 2r + 2 when those ranks exist in comm. If root is not rank 0,
 * the whole message is first relayed from root to rank 0. Messages of at most
 * one segment are sent in one piece; larger ones are cut into
 * count / segment segments that each node forwards as soon as it has received
 * them, and a trailing partial segment is broadcast with smpi_mpi_bcast().
 *
 * buf, count, datatype: the message; root: broadcasting rank; comm: the
 * communicator. Returns MPI_SUCCESS.
 *
 * Rank and size are taken from comm (not MPI_COMM_WORLD) so that the tree
 * matches the communicator the messages are actually sent on, and a node
 * without children (including rank 0 of a one-process communicator) never
 * sends.
 */
int smpi_coll_tuned_bcast_NTSB(void *buf, int count, MPI_Datatype datatype,
                               int root, MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  int rank, size;
  int i;

  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* source node and destination nodes (same throughout the function);
     -1 marks a missing child */
  int from = (rank - 1) / 2;
  int to_left = rank * 2 + 1;
  int to_right = rank * 2 + 2;
  if (to_left >= size)
    to_left = -1;
  if (to_right >= size)
    to_right = -1;

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_NTSB_segment_size_in_byte / extent;
  /* an element larger than the configured segment would give 0 and make the
     divisions below trap: pipeline one element at a time instead */
  if (segment == 0)
    segment = 1;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* when a message is smaller than a block size => no pipeline:
     receive from the parent (unless we are the tree root), then relay to
     whichever children exist, left first */
  if (count <= segment) {
    if (rank != 0)
      smpi_mpi_recv(buf, count, datatype, from, tag, comm, &status);
    if (to_left != -1)
      smpi_mpi_send(buf, count, datatype, to_left, tag, comm);
    if (to_right != -1)
      smpi_mpi_send(buf, count, datatype, to_right, tag, comm);
    return MPI_SUCCESS;
  }

  /* pipelining */
  send_request_array =
      (MPI_Request *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Request));
  recv_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  send_status_array =
      (MPI_Status *) xbt_malloc(2 * (size + pipe_length) * sizeof(MPI_Status));
  recv_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

  /* every node but the tree root posts all its receives up front;
     segment i travels with tag + i */
  if (rank != 0) {
    for (i = 0; i < pipe_length; i++) {
      recv_request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype, from,
                tag + i, comm);
    }
  }

  if (to_left == -1) {
    /* leaf ==> receive only (a childless tree root has nothing to do) */
    if (rank != 0)
      smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array);
  } else {
    /* tree root or intermediate node ==> relay each segment as soon as it is
       available; left-child requests occupy [0, pipe_length), right-child
       requests [pipe_length, 2 * pipe_length) */
    int send_total = (to_right == -1) ? pipe_length : 2 * pipe_length;

    for (i = 0; i < pipe_length; i++) {
      if (rank != 0)
        smpi_mpi_wait(&recv_request_array[i], &status);
      send_request_array[i] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_left,
                tag + i, comm);
      if (to_right != -1)
        send_request_array[i + pipe_length] = smpi_mpi_isend((char *) buf + (i * increment), segment, datatype, to_right,
                  tag + i, comm);
    }
    smpi_mpi_waitall(send_total, send_request_array, send_status_array);
  }

  free(send_request_array);
  free(recv_request_array);
  free(send_status_array);
  free(recv_status_array);
  /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_NTSB use default MPI_bcast.");
    smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
Example #6
0
/*
 * Two-level binary-tree broadcast for SMP clusters.
 *
 * Ranks are grouped into blocks of host_num_core consecutive ranks (the size
 * of the intra-node communicator). The first rank of each block is that
 * block's "SMP root"; SMP roots form a binary tree across blocks (inter
 * tree), and the ranks of a block form a binary tree below their SMP root
 * (intra tree). An SMP root forwards to its inter-tree children first, then
 * to its intra-tree children. If root is not rank 0, the message is first
 * relayed from root to rank 0.
 *
 * Messages of at most one segment are sent in one piece; larger ones are cut
 * into count / segment segments (segment i uses tag + i), and a trailing
 * partial segment is broadcast with smpi_mpi_bcast().
 *
 * When the communicator is not uniform, the call is delegated to
 * smpi_coll_tuned_bcast_mpich() and its result returned.
 *
 * buf, count, datatype: the message; root: broadcasting rank; comm: the
 * communicator. Returns MPI_SUCCESS on both the small-message and the
 * pipelined path (the pipelined path used to return 1).
 */
int smpi_coll_tuned_bcast_SMP_binary(void *buf, int count,
                                     MPI_Datatype datatype, int root,
                                     MPI_Comm comm)
{
  int tag = COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *request_array;
  MPI_Status *status_array;
  int rank, size;
  int i;
  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);
  if(smpi_comm_get_leaders_comm(comm)==MPI_COMM_NULL){
    smpi_comm_init_smp(comm);
  }
  int host_num_core=1;
  if (smpi_comm_is_uniform(comm)){
    host_num_core = smpi_comm_size(smpi_comm_get_intra_comm(comm));
  }else{
    //implementation buggy in this case
    return smpi_coll_tuned_bcast_mpich( buf , count, datatype,
              root, comm);
  }

  int segment = bcast_SMP_binary_segment_byte / extent;
  /* an element larger than the configured segment would give 0 and make the
     divisions below trap: pipeline one element at a time instead */
  if (segment == 0)
    segment = 1;
  int pipe_length = count / segment;
  int remainder = count % segment;

  int to_intra_left = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 1;
  int to_intra_right = (rank / host_num_core) * host_num_core + (rank % host_num_core) * 2 + 2;
  int to_inter_left = ((rank / host_num_core) * 2 + 1) * host_num_core;
  int to_inter_right = ((rank / host_num_core) * 2 + 2) * host_num_core;
  int from_inter = (((rank / host_num_core) - 1) / 2) * host_num_core;
  int from_intra = (rank / host_num_core) * host_num_core + ((rank % host_num_core) - 1) / 2;
  int increment = segment * extent;

  /* first rank of our block, and number of ranks actually present in it
     (the last block may be partial) */
  int base = (rank / host_num_core) * host_num_core;
  int num_core = host_num_core;
  if (((rank / host_num_core) * host_num_core) == ((size / host_num_core) * host_num_core))
    num_core = size - (rank / host_num_core) * host_num_core;

  /* role of this rank and which of its potential children exist; only SMP
     roots take part in the inter tree */
  int is_smp_root = (rank % host_num_core == 0);
  int parent = is_smp_root ? from_inter : from_intra;
  int has_inter_left = is_smp_root && (to_inter_left < size);
  int has_inter_right = is_smp_root && (to_inter_right < size);
  int has_intra_left = (to_intra_left - base) < num_core;
  int has_intra_right = (to_intra_right - base) < num_core;

  // if root is not zero send to rank zero first
  if (root != 0) {
    if (rank == root)
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    else if (rank == 0)
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
  }

  // when a message is smaller than a block size => no pipeline
  if (count <= segment) {
    /* everyone but rank 0 first receives from its parent */
    if (rank != 0) {
      request = smpi_mpi_irecv(buf, count, datatype, parent, tag, comm);
      smpi_mpi_wait(&request, &status);
    }
    if (has_inter_left)
      smpi_mpi_send(buf, count, datatype, to_inter_left, tag, comm);
    if (has_inter_right)
      smpi_mpi_send(buf, count, datatype, to_inter_right, tag, comm);
    if (has_intra_left)
      smpi_mpi_send(buf, count, datatype, to_intra_left, tag, comm);
    if (has_intra_right)
      smpi_mpi_send(buf, count, datatype, to_intra_right, tag, comm);

    return MPI_SUCCESS;
  }

  // pipeline bcast
  request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

  /* everyone but rank 0 posts all its receives up front */
  if (rank != 0) {
    for (i = 0; i < pipe_length; i++) {
      request_array[i] = smpi_mpi_irecv((char *) buf + (i * increment), segment, datatype,
                parent, (tag + i), comm);
    }
  }

  if (!is_smp_root && !has_intra_left) {
    // case intra-tree leaf: receive only
    smpi_mpi_waitall((pipe_length), request_array, status_array);
  } else {
    // case SMP root or intra-tree intermediate: relay segment by segment
    for (i = 0; i < pipe_length; i++) {
      char *chunk = (char *) buf + (i * increment);

      if (rank != 0)
        smpi_mpi_wait(&request_array[i], &status);
      if (has_inter_left)
        smpi_mpi_send(chunk, segment, datatype, to_inter_left, (tag + i), comm);
      if (has_inter_right)
        smpi_mpi_send(chunk, segment, datatype, to_inter_right, (tag + i), comm);
      if (has_intra_left)
        smpi_mpi_send(chunk, segment, datatype, to_intra_left, (tag + i), comm);
      if (has_intra_right)
        smpi_mpi_send(chunk, segment, datatype, to_intra_right, (tag + i), comm);
    }
  }

  free(request_array);
  free(status_array);

  // when count is not divisible by block size, use default BCAST for the remainder
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_SMP_binary use default MPI_bcast.");
    smpi_mpi_bcast((char *) buf + (pipe_length * increment), remainder, datatype,
              root, comm);
  }

  return MPI_SUCCESS;
}
/*
 * Non-topology-specific pipelined linear broadcast that serves receivers in
 * the order in which they arrive.
 *
 * Protocol (as implemented below):
 *  - If root is not 0, root first sends the whole buffer to rank 0, which then
 *    acts as the distributor.
 *  - Every rank other than 0 sends a 1-byte "ready" message to rank 0.
 *  - Rank 0 polls for ready messages with iprobe.  The ranks found ready in one
 *    polling round are written into header_buf, terminated by -1, and rank 0
 *    sends header + data to the first of them.  Each receiver looks itself up
 *    in the header and forwards header + data to the rank listed after it.
 *  - When no rank is ready, rank 0 serves the lowest-numbered rank that has not
 *    been served yet, and collects that rank's ready message once every rank
 *    has been served.
 *  - When count exceeds one segment the data travels as pipe_length segments of
 *    `segment` elements; the remaining count % segment elements are broadcast
 *    with smpi_mpi_bcast.
 *
 * buf      - data to broadcast on root, receive buffer elsewhere
 * count    - number of elements of `datatype` in buf
 * datatype - element type
 * root     - rank that owns the data
 * comm     - communicator
 *
 * Returns MPI_SUCCESS.
 */
int smpi_coll_tuned_bcast_arrival_pattern_aware(void *buf, int count,
                                                MPI_Datatype datatype, int root,
                                                MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;

  MPI_Status temp_status_array[MAX_NODE];

  int rank, size;
  int i, j;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_sent[MAX_NODE];
  int to_clean[MAX_NODE];
  int header_buf[HEADER_SIZE];
  /* zero-initialised so the 1-byte ready message never carries indeterminate data */
  char temp_buf[MAX_NODE] = { 0 };

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* destination */
  int to;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* The bookkeeping arrays above are indexed by rank and the header may list
     up to size - 1 ranks plus the -1 terminator.  A larger communicator would
     overflow them, so hand the whole broadcast to the default implementation.
     Every rank sees the same size, hence all of them take this branch. */
  if (size > MAX_NODE || size > HEADER_SIZE) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware use default MPI_bcast.");
    smpi_mpi_bcast(buf, count, datatype, root, comm);
    return MPI_SUCCESS;
  }

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_NTSL_segment_size_in_byte / extent;
  segment = segment == 0 ? 1 : segment;
  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst.
   */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* already_sent[i] == 0 means rank 0 has not sent data (or header) to node i yet;
     to_clean[i] != 0 means node i was served before its ready message was received,
     so that message still has to be collected */
  for (i = 0; i < MAX_NODE; i++) {
    already_sent[i] = 0;
    to_clean[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {
        /* poll every other rank for a pending message */
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                     MPI_STATUS_IGNORE);
        }

        header_index = 0;
        /* recv 1-byte message */
        for (i = 1; i < size; i++) {

          /* message arrive */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_send(buf, count, datatype, to, tag, comm);
        }

        /* nobody is ready: serve one node anyway */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm);
              smpi_mpi_send(buf, count, datatype, i, tag, comm);
              already_sent[i] = 1;
              /* its ready message has not been consumed yet */
              to_clean[i] = 1;
              sent_count++;
              break;
            }
          }
        }
      }                         /* while loop */

      /* collect the ready messages of the nodes that were served before
         announcing themselves, as the pipeline path below does; otherwise
         they would stay unmatched */
      for (i = 1; i < size; i++) {
        if (to_clean[i] != 0) {
          smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
        }
      }
    }

    /* non-root */
    else {

      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm,
               &status);
      smpi_mpi_recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status);

      /* search for where it is; senders only ever address ranks that are
         listed in the header, so the search stops before the -1 terminator */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header followed by data */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
        smpi_mpi_send(buf, count, datatype, header_buf[myordering + 1], tag, comm);
      }
    }
  }
  /* pipeline bcast */
  else {
    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      sent_count = 0;
      while (sent_count < (size - 1)) {
        /* poll every other rank for a pending message */
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                     &temp_status_array[i]);
        }
        header_index = 0;

        /* NOTE(review): the return value is discarded; the call is kept only
           because it may have side effects inside the simulator -- TODO confirm
           and remove */
        MPI_Wtime();
        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          /* message arrive */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm,
                     &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          /* send data - pipeline */
          for (i = 0; i < pipe_length; i++) {
            smpi_mpi_send((char *)buf + (i * increment), segment, datatype, to, tag, comm);
          }
        }

        /* nobody is ready: serve one node anyway */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              to = i;

              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

              /* still need to chop data so that we can use the same non-root code */
              for (j = 0; j < pipe_length; j++) {
                smpi_mpi_send((char *)buf + (j * increment), segment, datatype, to, tag,
                         comm);
              }

              already_sent[i] = 1;
              /* its ready message has not been consumed yet */
              to_clean[i] = 1;
              sent_count++;
              break;
            }
          }
        }
      }                         /* while loop */

      /* collect the ready messages of the nodes that were served before
         announcing themselves */
      for (i = 0; i < size; i++) {
        if (to_clean[i] != 0) {
          smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
        }
      }
    }

    /* every rank other than 0 */
    else {
      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header forward when required */
      request = smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is; senders only ever address ranks that are
         listed in the header, so the search stops before the -1 terminator */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, header_buf[myordering + 1],
                 tag, comm);
      }

      /* receive data: post one receive per segment */
      for (i = 0; i < pipe_length; i++) {
        recv_request_array[i] = smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype, MPI_ANY_SOURCE,
                                               tag, comm);
      }

      /* send data: forward each segment as soon as it has arrived */
      if (header_buf[myordering + 1] != -1) {
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          send_request_array[i] = smpi_mpi_isend((char *)buf + (i * increment), segment, datatype,
                    header_buf[myordering + 1], tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
      } else {
        /* last node of the chain: only wait for the data */
        smpi_mpi_waitall(pipe_length, recv_request_array, recv_status_array);
      }
    }

    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);
  }                             /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware use default MPI_bcast.");	  
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder, datatype, root, comm);
  }

  return MPI_SUCCESS;
}