/* Recursive-doubling reduce_scatter (MPICH variant).
 *
 * Used for the noncommutative and (non power-of-two or block-irregular)
 * cases. Every process keeps a full-size accumulation buffer; at step k it
 * exchanges the complement of its subtree's blocks with the partner at
 * distance 2^k (rank ^ mask) and reduces the received data into its
 * accumulator. At the end each rank copies its own block out of the
 * accumulator into recvbuf.
 *
 * Fixes vs. previous revision:
 *  - is_commutative was left uninitialized for noncommutative ops, so the
 *    branch selecting the reduction order read an indeterminate value (UB).
 *  - the function tail was missing entirely: no final copy into recvbuf, no
 *    release of the temporaries, and no return value.
 *  - sendtype/recvtype were committed every iteration but never freed
 *    (the free calls were commented out), leaking one datatype pair per step.
 *
 * NOTE(review): early error returns still leak the temporaries, matching the
 * original MPICH error-path style; left as-is to keep error semantics.
 */
int smpi_coll_tuned_reduce_scatter_mpich_rdb(void *sendbuf, void *recvbuf, int recvcounts[],
                                             MPI_Datatype datatype, MPI_Op op, MPI_Comm comm)
{
  int rank, comm_size, i;
  MPI_Aint extent, true_extent, true_lb;
  int *disps;
  void *tmp_recvbuf, *tmp_results;
  int mpi_errno = MPI_SUCCESS;
  int dis[2], blklens[2], total_count, dst;
  int mask, dst_tree_root, my_tree_root, j, k;
  int received;
  MPI_Datatype sendtype, recvtype;
  int nprocs_completed, tmp_mask, tree_root, is_commutative;

  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  extent = smpi_datatype_get_extent(datatype);
  smpi_datatype_extent(datatype, &true_lb, &true_extent);

  /* FIX: always initialize; previously only set when the op commutes. */
  is_commutative = smpi_op_is_commute(op);

  disps = (int *)xbt_malloc(comm_size * sizeof(int));

  total_count = 0;
  for (i = 0; i < comm_size; i++) {
    disps[i] = total_count;
    total_count += recvcounts[i];
  }

  /* need to allocate temporary buffer to receive incoming data */
  tmp_recvbuf = (void *)xbt_malloc(total_count * (max(true_extent, extent)));
  /* adjust for potential negative lower bound in datatype */
  tmp_recvbuf = (void *)((char *)tmp_recvbuf - true_lb);

  /* need to allocate another temporary buffer to accumulate results */
  tmp_results = (void *)xbt_malloc(total_count * (max(true_extent, extent)));
  /* adjust for potential negative lower bound in datatype */
  tmp_results = (void *)((char *)tmp_results - true_lb);

  /* copy sendbuf into tmp_results (recvbuf holds the input when IN_PLACE) */
  if (sendbuf != MPI_IN_PLACE)
    mpi_errno = smpi_datatype_copy(sendbuf, total_count, datatype,
                                   tmp_results, total_count, datatype);
  else
    mpi_errno = smpi_datatype_copy(recvbuf, total_count, datatype,
                                   tmp_results, total_count, datatype);
  if (mpi_errno)
    return (mpi_errno);

  mask = 0x1;
  i = 0;
  while (mask < comm_size) {
    dst = rank ^ mask;

    dst_tree_root = dst >> i;
    dst_tree_root <<= i;

    my_tree_root = rank >> i;
    my_tree_root <<= i;

    /* At step 1, processes exchange (n-n/p) amount of data; at step 2,
       (n-2n/p); at step 3, (n-4n/p), and so forth, using derived datatypes.
       A process does not need to send data indexed from my_tree_root to
       my_tree_root+mask-1, and won't receive data indexed from dst_tree_root
       to dst_tree_root+mask-1. */

    /* calculate sendtype: everything except this subtree's own blocks */
    blklens[0] = blklens[1] = 0;
    for (j = 0; j < my_tree_root; j++)
      blklens[0] += recvcounts[j];
    for (j = my_tree_root + mask; j < comm_size; j++)
      blklens[1] += recvcounts[j];

    dis[0] = 0;
    dis[1] = blklens[0];
    for (j = my_tree_root; (j < my_tree_root + mask) && (j < comm_size); j++)
      dis[1] += recvcounts[j];

    mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &sendtype);
    if (mpi_errno)
      return (mpi_errno);
    smpi_datatype_commit(&sendtype);

    /* calculate recvtype: same layout relative to the partner's subtree */
    blklens[0] = blklens[1] = 0;
    for (j = 0; j < dst_tree_root && j < comm_size; j++)
      blklens[0] += recvcounts[j];
    for (j = dst_tree_root + mask; j < comm_size; j++)
      blklens[1] += recvcounts[j];

    dis[0] = 0;
    dis[1] = blklens[0];
    for (j = dst_tree_root; (j < dst_tree_root + mask) && (j < comm_size); j++)
      dis[1] += recvcounts[j];

    mpi_errno = smpi_datatype_indexed(2, blklens, dis, datatype, &recvtype);
    if (mpi_errno)
      return (mpi_errno);
    smpi_datatype_commit(&recvtype);

    received = 0;
    if (dst < comm_size) {
      /* tmp_results contains the data to be sent in each step. Data is
         received in tmp_recvbuf and accumulated into tmp_results below. */
      smpi_mpi_sendrecv(tmp_results, 1, sendtype, dst, COLL_TAG_SCATTER,
                        tmp_recvbuf, 1, recvtype, dst, COLL_TAG_SCATTER,
                        comm, MPI_STATUS_IGNORE);
      received = 1;
    }

    /* If some processes in this subtree had no partner because of a
       non-power-of-two size, send them the result with a logarithmic
       recursive-halving scheme. */
    if (dst_tree_root + mask > comm_size) {
      nprocs_completed = comm_size - my_tree_root - mask;
      /* nprocs_completed processes in this subtree already have all data.
         k = number of least-significant bits of this rank to zero out to
         find the root of the current (sub)tree. */
      j = mask;
      k = 0;
      while (j) {
        j >>= 1;
        k++;
      }
      k--;

      tmp_mask = mask >> 1;
      while (tmp_mask) {
        dst = rank ^ tmp_mask;

        tree_root = rank >> k;
        tree_root <<= k;

        /* send only if this proc has data and destination doesn't;
           multiple processes may send in the same step */
        if ((dst > rank) && (rank < tree_root + nprocs_completed)
            && (dst >= tree_root + nprocs_completed)) {
          smpi_mpi_send(tmp_recvbuf, 1, recvtype, dst, COLL_TAG_SCATTER, comm);
          /* tmp_recvbuf contains the data received in this step so far */
        }
        /* recv only if this proc doesn't have data and the sender does */
        else if ((dst < rank) && (dst < tree_root + nprocs_completed)
                 && (rank >= tree_root + nprocs_completed)) {
          smpi_mpi_recv(tmp_recvbuf, 1, recvtype, dst, COLL_TAG_SCATTER,
                        comm, MPI_STATUS_IGNORE);
          received = 1;
        }
        tmp_mask >>= 1;
        k--;
      }
    }

    /* The reduction is done here rather than right after the sendrecv/recv
       above: doing it earlier in the noncommutative case would need an extra
       temp buffer so as not to overwrite tmp_recvbuf, which may still have to
       be forwarded to other processes in the non-power-of-two case. */
    if (received) {
      if (is_commutative || (dst_tree_root < my_tree_root)) {
        smpi_op_apply(op, tmp_recvbuf, tmp_results, &blklens[0], &datatype);
        smpi_op_apply(op, ((char *)tmp_recvbuf + dis[1] * extent),
                      ((char *)tmp_results + dis[1] * extent),
                      &blklens[1], &datatype);
      } else {
        /* noncommutative op and partner's data precedes ours: reduce in the
           opposite order, into tmp_recvbuf, then copy the result back */
        smpi_op_apply(op, tmp_results, tmp_recvbuf, &blklens[0], &datatype);
        smpi_op_apply(op, ((char *)tmp_results + dis[1] * extent),
                      ((char *)tmp_recvbuf + dis[1] * extent),
                      &blklens[1], &datatype);
        mpi_errno = smpi_datatype_copy(tmp_recvbuf, 1, recvtype,
                                       tmp_results, 1, recvtype);
        if (mpi_errno)
          return (mpi_errno);
      }
    }

    /* FIX: release the per-step derived datatypes (previously leaked) */
    smpi_datatype_free(&sendtype);
    smpi_datatype_free(&recvtype);

    mask <<= 1;
    i++;
  }

  /* FIX(missing tail): scatter this rank's block of the accumulated result
     into recvbuf and release the temporaries. */
  mpi_errno = smpi_datatype_copy(((char *)tmp_results + disps[rank] * extent),
                                 recvcounts[rank], datatype,
                                 recvbuf, recvcounts[rank], datatype);

  xbt_free(disps);
  /* undo the lower-bound adjustment before freeing the raw allocations */
  xbt_free((char *)tmp_recvbuf + true_lb);
  xbt_free((char *)tmp_results + true_lb);

  return mpi_errno;
}
/* Neighbor-exchange allgatherv (Open MPI "tuned" variant).
 *
 * Only defined for even communicator sizes; odd sizes are delegated to the
 * ring algorithm. After seeding its own block into rbuf, each rank first
 * swaps one block with its primary neighbor, then performs nprocs/2 - 1
 * steps in which it exchanges a pair of adjacent blocks (described by a
 * freshly built indexed datatype over rbuf) with alternating left/right
 * neighbors. Completes the gather in nprocs/2 communication steps.
 */
int smpi_coll_tuned_allgatherv_ompi_neighborexchange(void *sbuf, int scount,
                                                     MPI_Datatype sdtype,
                                                     void *rbuf, int *rcounts, int *rdispls,
                                                     MPI_Datatype rdtype,
                                                     MPI_Comm comm)
{
  int fail_line = -1;
  int rc = 0;
  int me, nprocs;
  int neighbor[2], offset_at_step[2], recv_data_from[2], send_data_from;
  int step, on_even_rank;
  ptrdiff_t send_lb, recv_lb, send_ext, recv_ext;
  char *src_ptr = NULL, *dst_ptr = NULL;

  nprocs = smpi_comm_size(comm);
  me = smpi_comm_rank(comm);

  /* The pairing scheme below needs an even number of ranks. */
  if (nprocs % 2) {
    XBT_DEBUG(
        "coll:tuned:allgatherv_ompi_neighborexchange WARNING: odd size %d, switching to ring algorithm",
        nprocs);
    return smpi_coll_tuned_allgatherv_ring(sbuf, scount, sdtype,
                                           rbuf, rcounts, rdispls, rdtype, comm);
  }

  XBT_DEBUG("coll:tuned:allgatherv_ompi_neighborexchange rank %d", me);

  rc = smpi_datatype_extent(sdtype, &send_lb, &send_ext);
  if (rc != MPI_SUCCESS) { fail_line = __LINE__; goto bail; }

  rc = smpi_datatype_extent(rdtype, &recv_lb, &recv_ext);
  if (rc != MPI_SUCCESS) { fail_line = __LINE__; goto bail; }

  /* Initialization: unless IN_PLACE, copy our contribution into our own
     block of the receive buffer. */
  dst_ptr = (char *)rbuf + rdispls[me] * recv_ext;
  if (sbuf != MPI_IN_PLACE) {
    src_ptr = (char *)sbuf;
    rc = smpi_datatype_copy(src_ptr, scount, sdtype, dst_ptr, rcounts[me], rdtype);
    if (rc != MPI_SUCCESS) { fail_line = __LINE__; goto bail; }
  }

  /* Determine neighbors, arrival order of blocks, and the per-parity offset
     by which the receive index advances each step. */
  on_even_rank = !(me % 2);
  if (on_even_rank) {
    neighbor[0] = (me + 1) % nprocs;
    neighbor[1] = (me - 1 + nprocs) % nprocs;
    recv_data_from[0] = me;
    recv_data_from[1] = me;
    offset_at_step[0] = (+2);
    offset_at_step[1] = (-2);
  } else {
    neighbor[0] = (me - 1 + nprocs) % nprocs;
    neighbor[1] = (me + 1) % nprocs;
    recv_data_from[0] = neighbor[0];
    recv_data_from[1] = neighbor[0];
    offset_at_step[0] = (-2);
    offset_at_step[1] = (+2);
  }

  /* First step is special: exchange a single block with neighbor[0]. */
  dst_ptr = (char *)rbuf + rdispls[neighbor[0]] * recv_ext;
  src_ptr = (char *)rbuf + rdispls[me] * recv_ext;
  smpi_mpi_sendrecv(src_ptr, rcounts[me], rdtype, neighbor[0], COLL_TAG_ALLGATHERV,
                    dst_ptr, rcounts[neighbor[0]], rdtype, neighbor[0], COLL_TAG_ALLGATHERV,
                    comm, MPI_STATUS_IGNORE);

  /* Initial sending block for the paired phase. */
  send_data_from = on_even_rank ? me : recv_data_from[0];

  /* Remaining steps: advance recv_data_from by the parity offset and swap
     two adjacent blocks, addressed via indexed datatypes over rbuf; the send
     location becomes the previous receive location. */
  for (step = 1; step < (nprocs / 2); step++) {
    MPI_Datatype new_rdtype, new_sdtype;
    int new_scounts[2], new_sdispls[2], new_rcounts[2], new_rdispls[2];
    const int parity = step % 2;

    recv_data_from[parity] =
        (recv_data_from[parity] + offset_at_step[parity] + nprocs) % nprocs;

    /* Sending blocks (send_data_from) and (send_data_from + 1). */
    new_scounts[0] = rcounts[send_data_from];
    new_scounts[1] = rcounts[(send_data_from + 1)];
    new_sdispls[0] = rdispls[send_data_from];
    new_sdispls[1] = rdispls[(send_data_from + 1)];
    rc = smpi_datatype_indexed(2, new_scounts, new_sdispls, rdtype, &new_sdtype);
    if (rc != MPI_SUCCESS) { fail_line = __LINE__; goto bail; }
    smpi_datatype_commit(&new_sdtype);

    /* Receiving blocks (recv_data_from[parity]) and (recv_data_from[parity] + 1). */
    new_rcounts[0] = rcounts[recv_data_from[parity]];
    new_rcounts[1] = rcounts[(recv_data_from[parity] + 1)];
    new_rdispls[0] = rdispls[recv_data_from[parity]];
    new_rdispls[1] = rdispls[(recv_data_from[parity] + 1)];
    rc = smpi_datatype_indexed(2, new_rcounts, new_rdispls, rdtype, &new_rdtype);
    if (rc != MPI_SUCCESS) { fail_line = __LINE__; goto bail; }
    smpi_datatype_commit(&new_rdtype);

    /* Displacements are baked into the datatypes, so both sides start at rbuf. */
    smpi_mpi_sendrecv((char *)rbuf, 1, new_sdtype, neighbor[parity], COLL_TAG_ALLGATHERV,
                      (char *)rbuf, 1, new_rdtype, neighbor[parity], COLL_TAG_ALLGATHERV,
                      comm, MPI_STATUS_IGNORE);

    send_data_from = recv_data_from[parity];

    smpi_datatype_free(&new_sdtype);
    smpi_datatype_free(&new_rdtype);
  }

  return MPI_SUCCESS;

bail:
  XBT_DEBUG("%s:%4d\tError occurred %d, rank %2d", __FILE__, fail_line, rc, me);
  return rc;
}
/*****************************************************************************
 * Function: alltoall_bruck
 *
 * Return: int (MPI_SUCCESS)
 *
 * Inputs:
 *   send_buff:  send input buffer
 *   send_count: number of elements to send
 *   send_type:  data type of elements being sent
 *   recv_buff:  receive output buffer
 *   recv_count: number of elements to receive
 *   recv_type:  data type of elements being received
 *   comm:       communicator
 *
 * Descrp: Realizes the alltoall operation using the Bruck algorithm:
 *   an initial local rotation of the data by `rank` blocks, then
 *   ceil(log2(p)) exchange steps in which each process packs the blocks
 *   whose index has bit pof2 set and sends them to rank+pof2, and finally
 *   a local inverse rotation plus block reversal to put every block in its
 *   destination position. The "local" phases are implemented as sendrecv
 *   calls with the process itself (src == dst == rank).
 *
 * Auther: MPICH / modified by Ahmad Faraj
 ****************************************************************************/
int smpi_coll_tuned_alltoall_bruck(void *send_buff, int send_count,
                                   MPI_Datatype send_type,
                                   void *recv_buff, int recv_count,
                                   MPI_Datatype recv_type, MPI_Comm comm)
{
  MPI_Status status;
  MPI_Aint extent;
  MPI_Datatype new_type;

  int *blocks_length, *disps;
  /* NOTE(review): `remainder` is declared but never used in this body. */
  int i, src, dst, rank, num_procs, count, remainder, block, position;
  int pack_size, tag = COLL_TAG_ALLTOALL, pof2 = 1;

  char *tmp_buff;
  char *send_ptr = (char *) send_buff;
  char *recv_ptr = (char *) recv_buff;

  num_procs = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  /* NOTE(review): all pointer arithmetic below uses the extent of recv_type
     only; this assumes send_type and recv_type have the same extent —
     TODO confirm against callers. */
  extent = smpi_datatype_get_extent(recv_type);

  /* staging buffer big enough for one full result's worth of blocks */
  tmp_buff = (char *) smpi_get_tmp_sendbuffer(num_procs * recv_count * extent);
  disps = (int *) xbt_malloc(sizeof(int) * num_procs);
  blocks_length = (int *) xbt_malloc(sizeof(int) * num_procs);

  /* Phase 1 — local rotation: copy blocks [rank..p-1] of send_buff to the
     front of recv_buff, then blocks [0..rank-1] behind them. Both copies are
     self-sendrecvs (partner == rank). */
  smpi_mpi_sendrecv(send_ptr + rank * send_count * extent,
               (num_procs - rank) * send_count, send_type, rank, tag,
               recv_ptr, (num_procs - rank) * recv_count, recv_type, rank,
               tag, comm, &status);

  smpi_mpi_sendrecv(send_ptr, rank * send_count, send_type, rank, tag,
               recv_ptr + (num_procs - rank) * recv_count * extent,
               rank * recv_count, recv_type, rank, tag, comm, &status);

  /* upper bound on the packed size of any step's payload */
  MPI_Pack_size(send_count * num_procs, send_type, comm, &pack_size);

  /* Phase 2 — log2(p) exchange steps. At step pof2, gather every block
     whose index has bit pof2 set into an indexed datatype, pack it, send to
     rank+pof2 and receive the same layout from rank-pof2. */
  while (pof2 < num_procs) {
    dst = (rank + pof2) % num_procs;
    src = (rank - pof2 + num_procs) % num_procs;

    count = 0;
    for (block = 1; block < num_procs; block++)
      if (block & pof2) {
        blocks_length[count] = send_count;
        disps[count] = block * send_count;
        count++;
      }

    MPI_Type_indexed(count, blocks_length, disps, recv_type, &new_type);
    smpi_datatype_commit(&new_type);

    position = 0;
    /* pack the selected blocks out of recv_buff before sending, so the
       incoming data can be unpacked into the same positions */
    MPI_Pack(recv_buff, 1, new_type, tmp_buff, pack_size, &position, comm);

    smpi_mpi_sendrecv(tmp_buff, position, MPI_PACKED, dst, tag, recv_buff, 1,
                 new_type, src, tag, comm, &status);
    smpi_datatype_unuse(new_type);

    pof2 *= 2;
  }

  free(disps);
  free(blocks_length);

  /* Phase 3 — local inverse rotation into tmp_buff (two self-sendrecvs),
     then a block-reversal copy back into recv_buff so block i ends up at
     position p-i-1's slot as the algorithm requires. */
  smpi_mpi_sendrecv(recv_ptr + (rank + 1) * recv_count * extent,
               (num_procs - rank - 1) * recv_count, send_type,
               rank, tag, tmp_buff, (num_procs - rank - 1) * recv_count,
               recv_type, rank, tag, comm, &status);

  smpi_mpi_sendrecv(recv_ptr, (rank + 1) * recv_count, send_type, rank, tag,
               tmp_buff + (num_procs - rank - 1) * recv_count * extent,
               (rank + 1) * recv_count, recv_type, rank, tag, comm, &status);

  for (i = 0; i < num_procs; i++)
    smpi_mpi_sendrecv(tmp_buff + i * recv_count * extent, recv_count, send_type,
                 rank, tag,
                 recv_ptr + (num_procs - i - 1) * recv_count * extent,
                 recv_count, recv_type, rank, tag, comm, &status);

  smpi_free_tmp_buffer(tmp_buff);
  return MPI_SUCCESS;
}