/* Non-topology-specific pipelined linear-bcast function.
 *
 * Protocol: every non-root rank announces its arrival at rank 0 with a
 * 1-byte message.  Rank 0 repeatedly probes for these announcements, chains
 * the ranks that have arrived into header_buf (terminated by -1), sends the
 * header to the first rank of the chain, and pipelines the data down the
 * chain in `segment`-element blocks.  Each chain member forwards the header
 * and the data blocks to its successor (to == -1 means "last in chain").
 *
 * Fixes vs. previous revision:
 *  - rank/size and the root's probe/recv now use `comm` instead of
 *    MPI_COMM_WORLD (the data traffic already used `comm`), so the
 *    algorithm works on communicators other than MPI_COMM_WORLD —
 *    consistent with the other arrival-pattern-aware variants in this file.
 *  - guard against segment == 0 when the datatype extent exceeds the
 *    configured segment size (would divide by zero), matching
 *    smpi_coll_tuned_bcast_arrival_pattern_aware.
 *  - removed the dead `else if (2 == 3)` branch and the locals it used.
 */
int smpi_coll_tuned_bcast_arrival_pattern_aware_wait(void *buf, int count,
                                                     MPI_Datatype datatype,
                                                     int root, MPI_Comm comm)
{
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;
  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, k;

  int tag = -COLL_TAG_BCAST;
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* source and destination */
  int to, from;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_arrival_pattern_aware_wait_segment_size_in_byte / extent;
  /* never let the segment reach zero elements (huge extents) */
  segment = segment == 0 ? 1 : segment;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst. */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* value == 0 means root has not send data (or header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    segment = count;
    pipe_length = 1;
  }

  /* start pipeline bcast */
  send_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  recv_request_array =
      (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
  send_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
  recv_status_array =
      (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

  /* root */
  if (rank == 0) {
    sent_count = 0;

    for (i = 0; i < BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE; i++)
      will_send[i] = 0;

    while (sent_count < (size - 1)) {

      /* loop k times to let more processes arrive before start sending data */
      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                            &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              /* consume the 1-byte arrival announcement */
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
              i = 0;            /* restart the scan from the beginning */
            }
          }
        }
      }

      header_index = 0;

      /* chain together every rank that arrived in this round */
      for (i = 1; i < size; i++) {
        /* message arrive */
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /* send header followed by data */
      if (header_index != 0) {
        header_buf[header_index] = -1;  /* chain terminator */
        to = header_buf[0];

        /* send header */
        smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);

        /* send data - pipeline */
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] =
              smpi_mpi_isend((char *)buf + (i * increment), segment, datatype,
                             to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array,
                         send_status_array);
      }                         /* end - send header followed by data */
    }                           /* end - while (send_count < size-1) loop */
  }                             /* end - root */

  /* none root */
  else {

    /* send 1-byte message to root to announce arrival */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for header forward when required */
    request = smpi_mpi_irecv(header_buf, header_size, MPI_INT, MPI_ANY_SOURCE,
                             tag, comm);
    smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

    /* search for where it is */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    to = header_buf[myordering + 1];  /* -1 if we are the last in the chain */
    if (myordering == 0) {
      from = 0;                       /* chain head receives from root */
    } else {
      from = header_buf[myordering - 1];
    }

    /* send header when required */
    if (to != -1) {
      smpi_mpi_send(header_buf, header_size, MPI_INT, to, tag, comm);
    }

    /* receive data */
    for (i = 0; i < pipe_length; i++) {
      recv_request_array[i] =
          smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype,
                         from, tag, comm);
    }

    /* forward data */
    if (to != -1) {
      for (i = 0; i < pipe_length; i++) {
        smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
        send_request_array[i] =
            smpi_mpi_isend((char *)buf + (i * increment), segment, datatype,
                           to, tag, comm);
      }
      smpi_mpi_waitall((pipe_length), send_request_array, send_status_array);
    }

    /* recv only */
    else {
      smpi_mpi_waitall((pipe_length), recv_request_array, recv_status_array);
    }
  }

  free(send_request_array);
  free(recv_request_array);
  free(send_status_array);
  free(recv_status_array);
  /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware_wait use default MPI_bcast.");
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder,
                   datatype, root, comm);
  }

  return MPI_SUCCESS;
}
/* Non-topology-specific pipelined linear-reduce function.
 *
 * Mirror image of the arrival-pattern-aware broadcast: every non-root rank
 * announces its arrival at rank 0 with a 1-byte message.  Rank 0 chains the
 * arrived ranks in header_buf (terminated by -1) and the partial reductions
 * flow along that chain towards rank 0, with `op` applied at every hop.
 * Small messages (count <= segment) use a single send per hop; larger
 * messages are pipelined in `segment`-element blocks.
 *
 * NOTE(review): the arrival probes/receives use MPI_COMM_WORLD while the
 * data traffic uses `comm` — this is only correct when comm is
 * MPI_COMM_WORLD; confirm against callers.
 * NOTE(review): no guard against segment == 0 (extent larger than the
 * configured segment size) — would divide by zero below.
 */
int smpi_coll_tuned_reduce_arrival_pattern_aware(void *buf, void *rbuf,
                                                 int count,
                                                 MPI_Datatype datatype,
                                                 MPI_Op op, int root,
                                                 MPI_Comm comm)
{
  int rank;
  rank = smpi_comm_rank(comm);

  int tag = -COLL_TAG_REDUCE;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;
  MPI_Status temp_status_array[MAX_NODE];

  int size;
  int i;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_received[MAX_NODE];

  int header_buf[HEADER_SIZE];  /* chain of arrived ranks, -1 terminated */
  char temp_buf[MAX_NODE];      /* scratch space for the 1-byte arrival msgs */

  MPI_Aint extent, lb;
  smpi_datatype_extent(datatype, &lb, &extent);

  /* source and destination */
  int to, from;

  size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);  /* redundant re-assignment, kept as-is */

  /* segment is segment size in number of elements (not bytes) */
  int segment = reduce_arrival_pattern_aware_segment_size_in_byte / extent;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* value == 0 means root has not send data (or header) to the node yet */
  for (i = 0; i < MAX_NODE; i++) {
    already_received[i] = 0;
  }

  char *tmp_buf;
  tmp_buf = (char *) xbt_malloc(count * extent);

  /* seed rbuf with this rank's own contribution (self send/recv copy) */
  smpi_mpi_sendrecv(buf, count, datatype, rank, tag, rbuf, count, datatype,
                    rank, tag, comm, &status);

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {

    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {

        /* probe every rank that has not been chained yet */
        for (i = 1; i < size; i++) {
          if (already_received[i] == 0) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                            MPI_STATUSES_IGNORE);
            simcall_process_sleep(0.0001);  /* yield so peers can progress */
          }
        }

        header_index = 0;

        /* recv 1-byte message */
        for (i = 0; i < size; i++) {
          if (i == rank)
            continue;

          /* 1-byte message arrive */
          if ((flag_array[i] == 1) && (already_received[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, MPI_COMM_WORLD,
                          &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will receive in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by receive and reduce data:
           the chain reduces among itself and its tail delivers to root */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          from = header_buf[header_index - 1];

          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_recv(tmp_buf, count, datatype, from, tag, comm, &status);
          smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        }
      }                         /* while loop */
    }
    /* root */
    /* non-root */
    else {

      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag,
                    comm, &status);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* forward header */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT,
                      header_buf[myordering + 1], tag, comm);
      }

      /* receive, reduce, and forward data */

      /* send only (chain head: nothing to reduce with yet) */
      if (myordering == 0) {
        if (header_buf[myordering + 1] == -1) {
          to = 0;               /* alone in the chain: deliver to root */
        } else {
          to = header_buf[myordering + 1];
        }
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }

      /* recv, reduce, send */
      else {
        if (header_buf[myordering + 1] == -1) {
          to = 0;               /* tail of the chain delivers to root */
        } else {
          to = header_buf[myordering + 1];
        }
        from = header_buf[myordering - 1];
        smpi_mpi_recv(tmp_buf, count, datatype, header_buf[myordering - 1],
                      tag, comm, &status);
        smpi_op_apply(op, tmp_buf, rbuf, &count, &datatype);
        smpi_mpi_send(rbuf, count, datatype, to, tag, comm);
      }
    }                           /* non-root */
  }
  /* pipeline bcast */
  else {
    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      sent_count = 0;

      int will_send[MAX_NODE];
      for (i = 0; i < MAX_NODE; i++)
        will_send[i] = 0;

      /* loop until all data are received (sent) */
      while (sent_count < (size - 1)) {
        int k;
        for (k = 0; k < 1; k++) {
          for (i = 1; i < size; i++) {
            if ((already_received[i] == 0) && (will_send[i] == 0)) {
              smpi_mpi_iprobe(i, MPI_ANY_TAG, MPI_COMM_WORLD, &flag_array[i],
                              &temp_status_array[i]);
              if (flag_array[i] == 1) {
                will_send[i] = 1;
                /* consume the 1-byte arrival announcement */
                smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag,
                              MPI_COMM_WORLD, &status);
                i = 1;          /* restart the scan */
              }
            }
          }
        }                       /* end of probing */

        header_index = 0;

        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          /* message arrived in this round (put in the header) */
          if ((will_send[i] == 1) && (already_received[i] == 0)) {
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_received[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          /* recv data - pipeline: the chain tail sends the reduced blocks */
          from = header_buf[header_index - 1];
          for (i = 0; i < pipe_length; i++) {
            smpi_mpi_recv(tmp_buf + (i * increment), segment, datatype, from,
                          tag, comm, &status);
            smpi_op_apply(op, tmp_buf + (i * increment),
                          (char *)rbuf + (i * increment), &segment,
                          &datatype);
          }
        }
      }                         /* while loop (sent_count < size-1 ) */
    }
    /* root */
    /* none root */
    else {
      /* send 1-byte message to root */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header forward when required */
      request = smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT,
                               MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT,
                      header_buf[myordering + 1], tag, comm);
      }

      /* (receive, reduce), and send data */
      if (header_buf[myordering + 1] == -1) {
        to = 0;                 /* tail of the chain delivers to root */
      } else {
        to = header_buf[myordering + 1];
      }

      /* send only (chain head) */
      if (myordering == 0) {
        for (i = 0; i < pipe_length; i++) {
          send_request_array[i] =
              smpi_mpi_isend((char *)rbuf + (i * increment), segment,
                             datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array,
                         send_status_array);
      }

      /* receive, reduce, and send */
      else {
        from = header_buf[myordering - 1];
        for (i = 0; i < pipe_length; i++) {
          recv_request_array[i] =
              smpi_mpi_irecv(tmp_buf + (i * increment), segment, datatype,
                             from, tag, comm);
        }
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          smpi_op_apply(op, tmp_buf + (i * increment),
                        (char *)rbuf + (i * increment), &segment, &datatype);
          send_request_array[i] =
              smpi_mpi_isend((char *)rbuf + (i * increment), segment,
                             datatype, to, tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array,
                         send_status_array);
      }
    }                           /* non-root */

    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);
  }                             /* end pipeline */

  /* if root is not zero send root after finished
     this can be modified to make it faster by using logical src, dst. */
  if (root != 0) {
    if (rank == 0) {
      smpi_mpi_send(rbuf, count, datatype, root, tag, comm);
    } else if (rank == root) {
      smpi_mpi_recv(rbuf, count, datatype, 0, tag, comm, &status);
    }
  }

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    smpi_mpi_reduce((char *)buf + (pipe_length * increment),
                    (char *)rbuf + (pipe_length * increment), remainder,
                    datatype, op, root, comm);
  }

  free(tmp_buf);

  return MPI_SUCCESS;
}
/* Non-topology-specific arrival-scatter broadcast.
 *
 * Protocol: non-root ranks announce their arrival at rank 0 with a 1-byte
 * message.  Each round, rank 0 groups the ranks that arrived into a chain
 * (header_buf, -1 terminated), sends the header to each member, scatters a
 * distinct count/chain_len chunk of the buffer to each member, and the
 * members then complete the broadcast among themselves with a ring
 * all-gather.  The last chunk absorbs the division remainder.
 *
 * Fix vs. previous revision: the non-root path read header_buf[-1] when
 * myordering == 0 (`from = header_buf[myordering - 1];` was executed
 * unconditionally before the myordering == 0 correction) — an
 * out-of-bounds read.  `to`/`from` are now computed conditionally.
 */
int smpi_coll_tuned_bcast_arrival_scatter(void *buf, int count,
                                          MPI_Datatype datatype, int root,
                                          MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;    //in order to use ANY_TAG, make this one positive
  int header_tag = 10;          /* separate tag stream for headers */
  MPI_Status status;

  int curr_remainder;
  int curr_size;
  int curr_increment;
  int send_offset;
  int recv_offset;
  int send_count;
  int recv_count;

  MPI_Status temp_status_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];

  int rank, size;
  int i, k;

  int sent_count;
  int header_index;
  int flag_array[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int already_sent[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int header_buf[BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE];
  char temp_buf[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int will_send[BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE];
  int max_node = BCAST_ARRIVAL_PATTERN_AWARE_MAX_NODE;
  int header_size = BCAST_ARRIVAL_PATTERN_AWARE_HEADER_SIZE;

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* source and destination */
  int to, from;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* message too small to scatter one element per node */
  if (count < size) {
    XBT_WARN("MPI_bcast_arrival_scatter use default MPI_bcast.");
    smpi_mpi_bcast(buf, count, datatype, root, comm);
    return MPI_SUCCESS;
  }

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst. */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag - 1, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag - 1, comm, &status);
    }
  }

  /* value == 0 means root has not send data (or header) to the node yet */
  for (i = 0; i < max_node; i++) {
    already_sent[i] = 0;
  }

  /* start bcast */

  /* root */
  if (rank == 0) {

    for (i = 0; i < max_node; i++)
      will_send[i] = 0;

    sent_count = 0;
    while (sent_count < (size - 1)) {

      /* loop a few times to let more processes arrive before scattering */
      for (k = 0; k < 3; k++) {
        for (i = 1; i < size; i++) {
          if ((already_sent[i] == 0) && (will_send[i] == 0)) {
            smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                            &temp_status_array[i]);
            if (flag_array[i] == 1) {
              will_send[i] = 1;
              /* consume the 1-byte arrival announcement */
              smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
              i = 0;            /* restart the scan */
            }
          }
        }
      }

      header_index = 0;

      /* chain the ranks that arrived in this round */
      for (i = 1; i < size; i++) {
        /* message arrive */
        if ((will_send[i] == 1) && (already_sent[i] == 0)) {
          header_buf[header_index] = i;
          header_index++;
          sent_count++;

          /* will send in the next step */
          already_sent[i] = 1;
        }
      }

      /* send header followed by data */
      if (header_index != 0) {
        header_buf[header_index] = -1;  /* chain terminator */

        /* send header to every member of this round's chain */
        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          smpi_mpi_send(header_buf, header_size, MPI_INT, to, header_tag,
                        comm);
        }

        curr_remainder = count % header_index;
        curr_size = (count / header_index);
        curr_increment = curr_size * extent;

        /* scatter one chunk per chain member; the last chunk (or every
           chunk when curr_size == 0) absorbs the remainder */
        for (i = 0; i < header_index; i++) {
          to = header_buf[i];
          if ((i == (header_index - 1)) || (curr_size == 0))
            curr_size += curr_remainder;
          smpi_mpi_send((char *) buf + (i * curr_increment), curr_size,
                        datatype, to, tag, comm);
        }
      }
    }                           /* while (sent_count < size-1) */
  }

  /* none root */
  else {

    /* send 1-byte message to root to announce arrival */
    smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

    /* wait for the header from root */
    smpi_mpi_recv(header_buf, header_size, MPI_INT, 0, header_tag, comm,
                  &status);

    /* search for where it is */
    int myordering = 0;
    while (rank != header_buf[myordering]) {
      myordering++;
    }

    int total_nodes = 0;
    while (header_buf[total_nodes] != -1) {
      total_nodes++;
    }

    curr_remainder = count % total_nodes;
    curr_size = (count / total_nodes);
    curr_increment = curr_size * extent;
    int recv_size = curr_size;

    /* receive this node's chunk from root (last chunk carries the remainder) */
    if (myordering == (total_nodes - 1))
      recv_size += curr_remainder;
    smpi_mpi_recv((char *) buf + (myordering * curr_increment), recv_size,
                  datatype, 0, tag, comm, &status);

    /* at this point all nodes in this set perform all-gather operation.
       Ring neighbors, wrapping around the chain — computed conditionally so
       header_buf is never indexed with -1 (fix for out-of-bounds read). */
    if (myordering == (total_nodes - 1))
      to = header_buf[0];
    else
      to = header_buf[myordering + 1];

    if (myordering == 0)
      from = header_buf[total_nodes - 1];
    else
      from = header_buf[myordering - 1];

    /* last segment may have a larger size since it also include the remainder */
    int last_segment_ptr = (total_nodes - 1) * (count / total_nodes) * extent;

    /* allgather */
    for (i = 0; i < total_nodes - 1; i++) {
      send_offset =
          ((myordering - i + total_nodes) % total_nodes) * curr_increment;
      recv_offset =
          ((myordering - i - 1 + total_nodes) % total_nodes) * curr_increment;

      /* adjust size: the last segment carries the remainder */
      if (send_offset != last_segment_ptr)
        send_count = curr_size;
      else
        send_count = curr_size + curr_remainder;

      if (recv_offset != last_segment_ptr)
        recv_count = curr_size;
      else
        recv_count = curr_size + curr_remainder;

      smpi_mpi_sendrecv((char *) buf + send_offset, send_count, datatype, to,
                        tag + i, (char *) buf + recv_offset, recv_count,
                        datatype, from, tag + i, comm, &status);
    }
  }                             /* non-root */

  return MPI_SUCCESS;
}
/* Non-topology-specific pipelined linear-bcast function.
 *
 * Protocol: non-root ranks announce their arrival at rank 0 with a 1-byte
 * message.  Each round, rank 0 chains the arrived ranks in header_buf
 * (-1 terminated), sends the header to the chain head and pipelines the
 * data down the chain in `segment`-element blocks; if nobody arrived this
 * round, it pushes the data to the first rank that has not received yet
 * (and later drains that rank's pending 1-byte announcement).  Small
 * messages (count <= segment) use single sends instead of the pipeline.
 */
int smpi_coll_tuned_bcast_arrival_pattern_aware(void *buf, int count,
                                                MPI_Datatype datatype,
                                                int root, MPI_Comm comm)
{
  int tag = -COLL_TAG_BCAST;
  MPI_Status status;
  MPI_Request request;
  MPI_Request *send_request_array;
  MPI_Request *recv_request_array;
  MPI_Status *send_status_array;
  MPI_Status *recv_status_array;
  MPI_Status temp_status_array[MAX_NODE];

  int rank, size;
  int i, j;

  int sent_count;
  int header_index;
  int flag_array[MAX_NODE];
  int already_sent[MAX_NODE];
  int to_clean[MAX_NODE];       /* ranks pushed to without an announcement;
                                   their 1-byte msg is drained at the end */
  int header_buf[HEADER_SIZE];  /* chain of arrived ranks, -1 terminated */
  char temp_buf[MAX_NODE];      /* scratch for the 1-byte arrival messages */

  MPI_Aint extent;
  extent = smpi_datatype_get_extent(datatype);

  /* destination */
  int to;

  rank = smpi_comm_rank(comm);
  size = smpi_comm_size(comm);

  /* segment is segment size in number of elements (not bytes) */
  int segment = bcast_NTSL_segment_size_in_byte / extent;
  /* never let the segment reach zero elements (huge extents) */
  segment = segment == 0 ? 1 :segment;

  /* pipeline length */
  int pipe_length = count / segment;

  /* use for buffer offset for sending and receiving data = segment size in byte */
  int increment = segment * extent;

  /* if the input size is not divisible by segment size =>
     the small remainder will be done with native implementation */
  int remainder = count % segment;

  /* if root is not zero send to rank zero first
     this can be modified to make it faster by using logical src, dst. */
  if (root != 0) {
    if (rank == root) {
      smpi_mpi_send(buf, count, datatype, 0, tag, comm);
    } else if (rank == 0) {
      smpi_mpi_recv(buf, count, datatype, root, tag, comm, &status);
    }
  }

  /* value == 0 means root has not send data (or header) to the node yet */
  for (i = 0; i < MAX_NODE; i++) {
    already_sent[i] = 0;
    to_clean[i] = 0;
  }

  /* when a message is smaller than a block size => no pipeline */
  if (count <= segment) {
    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {
        /* poll every peer for an arrival announcement */
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                          MPI_STATUSES_IGNORE);
        }

        header_index = 0;

        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          /* message arrive */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(temp_buf, 1, MPI_CHAR, i, tag, comm, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);
          smpi_mpi_send(buf, count, datatype, to, tag, comm);
        }

        /* nobody arrived this round: push to one waiting node */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, i, tag, comm);
              smpi_mpi_send(buf, count, datatype, i, tag, comm);
              already_sent[i] = 1;
              sent_count++;
              break;
            }
          }
        }
      }                         /* while loop */
    }

    /* non-root */
    else {

      /* send 1-byte message to root to announce arrival */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header and data, forward when required */
      smpi_mpi_recv(header_buf, HEADER_SIZE, MPI_INT, MPI_ANY_SOURCE, tag,
                    comm, &status);
      smpi_mpi_recv(buf, count, datatype, MPI_ANY_SOURCE, tag, comm, &status);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header followed by data to our successor, if any */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT,
                      header_buf[myordering + 1], tag, comm);
        smpi_mpi_send(buf, count, datatype, header_buf[myordering + 1], tag,
                      comm);
      }
    }
  }

  /* pipeline bcast */
  else {
    send_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    recv_request_array =
        (MPI_Request *) xbt_malloc((size + pipe_length) * sizeof(MPI_Request));
    send_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));
    recv_status_array =
        (MPI_Status *) xbt_malloc((size + pipe_length) * sizeof(MPI_Status));

    if (rank == 0) {
      sent_count = 0;

      while (sent_count < (size - 1)) {
        /* poll every peer for an arrival announcement */
        for (i = 1; i < size; i++) {
          smpi_mpi_iprobe(i, MPI_ANY_TAG, comm, &flag_array[i],
                          &temp_status_array[i]);
        }

        header_index = 0;

        MPI_Wtime();            /* leftover timing probe; result discarded */

        /* recv 1-byte message */
        for (i = 1; i < size; i++) {
          /* message arrive */
          if ((flag_array[i] == 1) && (already_sent[i] == 0)) {
            smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag, comm, &status);
            header_buf[header_index] = i;
            header_index++;
            sent_count++;

            /* will send in the next step */
            already_sent[i] = 1;
          }
        }

        /* send header followed by data */
        if (header_index != 0) {
          header_buf[header_index] = -1;
          to = header_buf[0];

          /* send header */
          smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

          /* send data - non-pipeline case (disabled branch, kept as-is) */
          if (0 == 1) {
            smpi_mpi_send(buf, count, datatype, to, tag, comm);
          }

          /* send data - pipeline */
          else {
            for (i = 0; i < pipe_length; i++) {
              smpi_mpi_send((char *)buf + (i * increment), segment, datatype,
                            to, tag, comm);
            }
          }
        }

        /* nobody arrived this round: push to one waiting node */
        else {
          /* search for the first node that never received data before */
          for (i = 1; i < size; i++) {
            if (already_sent[i] == 0) {
              header_buf[0] = i;
              header_buf[1] = -1;
              to = i;

              smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT, to, tag, comm);

              /* still need to chop data so that we can use the same
                 non-root code */
              for (j = 0; j < pipe_length; j++) {
                smpi_mpi_send((char *)buf + (j * increment), segment,
                              datatype, to, tag, comm);
              }
              already_sent[i] = 1;
              to_clean[i]=1;    /* its arrival msg is still in flight */
              sent_count++;
              break;
            }
          }
        }
      }                         /* while loop */

      /* drain the pending 1-byte announcements of pushed-to nodes */
      for(i=0; i<size; i++)
        if(to_clean[i]!=0)smpi_mpi_recv(&temp_buf[i], 1, MPI_CHAR, i, tag,
                                        comm, &status);
    }
    /* rank 0 */
    /* none root */
    else {

      /* send 1-byte message to root to announce arrival */
      smpi_mpi_send(temp_buf, 1, MPI_CHAR, 0, tag, comm);

      /* wait for header forward when required */
      request = smpi_mpi_irecv(header_buf, HEADER_SIZE, MPI_INT,
                               MPI_ANY_SOURCE, tag, comm);
      smpi_mpi_wait(&request, MPI_STATUS_IGNORE);

      /* search for where it is */
      int myordering = 0;
      while (rank != header_buf[myordering]) {
        myordering++;
      }

      /* send header when required */
      if (header_buf[myordering + 1] != -1) {
        smpi_mpi_send(header_buf, HEADER_SIZE, MPI_INT,
                      header_buf[myordering + 1], tag, comm);
      }

      /* receive data — the first branch is disabled (kept as-is); the live
         path posts one irecv per pipeline block */
      if (0 == -1) {
        request = smpi_mpi_irecv(buf, count, datatype, 0, tag, comm);
        smpi_mpi_wait(&request, MPI_STATUS_IGNORE);
      } else {
        for (i = 0; i < pipe_length; i++) {
          recv_request_array[i] =
              smpi_mpi_irecv((char *)buf + (i * increment), segment, datatype,
                             MPI_ANY_SOURCE, tag, comm);
        }
      }

      /* send data: forward each block to the successor as it arrives */
      if (header_buf[myordering + 1] != -1) {
        for (i = 0; i < pipe_length; i++) {
          smpi_mpi_wait(&recv_request_array[i], MPI_STATUS_IGNORE);
          send_request_array[i] =
              smpi_mpi_isend((char *)buf + (i * increment), segment, datatype,
                             header_buf[myordering + 1], tag, comm);
        }
        smpi_mpi_waitall((pipe_length), send_request_array,
                         send_status_array);
      }else{
        /* last in the chain: just wait for all blocks */
        smpi_mpi_waitall(pipe_length, recv_request_array, recv_status_array);
      }
    }

    free(send_request_array);
    free(recv_request_array);
    free(send_status_array);
    free(recv_status_array);
  }                             /* end pipeline */

  /* when count is not divisible by block size, use default BCAST for the remainder */
  if ((remainder != 0) && (count > segment)) {
    XBT_WARN("MPI_bcast_arrival_pattern_aware use default MPI_bcast.");
    smpi_mpi_bcast((char *)buf + (pipe_length * increment), remainder,
                   datatype, root, comm);
  }

  return MPI_SUCCESS;
}