// Allgather - gather/bcast algorithm int smpi_coll_tuned_allgatherv_GB(void *send_buff, int send_count, MPI_Datatype send_type, void *recv_buff, int *recv_counts, int *recv_disps, MPI_Datatype recv_type, MPI_Comm comm) { smpi_mpi_gatherv(send_buff, send_count, send_type, recv_buff, recv_counts, recv_disps, recv_type, 0, comm); int num_procs, i, current, max = 0; num_procs = smpi_comm_size(comm); for (i = 0; i < num_procs; i++) { current = recv_disps[i] + recv_counts[i]; if (current > max) max = current; } mpi_coll_bcast_fun(recv_buff, max, recv_type, 0, comm); return MPI_SUCCESS; }
/*
 * Two-level (intra-node, then inter-leader) gather.
 *
 * Outline of what the code does:
 *   1. Intra-node phase: all ranks call MPIR_pt_pt_intra_gather on the
 *      intra-node communicator; the node leader (local rank 0) supplies a
 *      temporary buffer (tmp_buf) sized for local_size contributions.
 *   2. Inter-leader phase (node leaders only): the leaders gather their
 *      tmp_buf towards leader_root, the rank in the leaders communicator of
 *      the leader of root's node. A gatherv with per-node counts is used when
 *      smpi_comm_is_uniform(comm) is false, a single MPIR_Gather_MV2_Direct
 *      otherwise. The data lands directly in recvbuf when root is itself a
 *      node leader, and in a staging buffer (leader_gather_buf) when it is
 *      not.
 *   3. Forwarding phase: when root is not a node leader, its leader sends the
 *      staging buffer to root, which receives it into recvbuf.
 *
 * Parameters:
 *   sendbuf, sendcnt, sendtype - local contribution (forwarded to the
 *                                intra-node gather).
 *   recvbuf, recvcnt, recvtype - destination on root; recvcnt is the count
 *                                per contributing rank.
 *   root                       - rank in comm that collects the data.
 *   comm                       - communicator; its SMP structures are
 *                                initialised here if the leaders communicator
 *                                is still MPI_COMM_NULL.
 *
 * Returns:
 *   MPI_SUCCESS  immediately when this rank has nothing to transfer (root
 *                with recvcnt == 0, or non-root with sendcnt == 0);
 *   MPI_ERR_OTHER when a temporary buffer or the counts/displacements arrays
 *                cannot be obtained;
 *   otherwise the value last stored in mpi_errno by the gather helpers.
 *
 * NOTE(review): the results of smpi_mpi_gatherv / smpi_mpi_send /
 * smpi_mpi_recv are not propagated, and the status of the intra-node gather
 * is overwritten by the inter-leader gather in the uniform case.
 *
 * All error exits taken after a temporary buffer has been obtained go through
 * the fn_exit label so that every buffer obtained so far is released.
 */
int smpi_coll_tuned_gather_mvapich2_two_level(void *sendbuf, int sendcnt,
                                              MPI_Datatype sendtype,
                                              void *recvbuf, int recvcnt,
                                              MPI_Datatype recvtype, int root,
                                              MPI_Comm comm)
{
  void *leader_gather_buf = NULL; /* staging buffer on the leader of root's node */
  void *tmp_buf = NULL;           /* per-node buffer owned by the node leader */
  int comm_size, rank;
  int local_rank, local_size;
  int leader_comm_rank = -1, leader_comm_size = 0;
  int mpi_errno = MPI_SUCCESS;
  int recvtype_size = 0, sendtype_size = 0, nbytes = 0;
  int leader_root, leader_of_root;
  int *leaders_map = NULL;
  MPI_Status status;
  MPI_Aint sendtype_extent = 0, recvtype_extent = 0; /* Datatype extent */
  /* Zero-initialised: they are only filled in for non-null datatypes but are
   * read inside MAX() when sizing the temporary buffers. */
  MPI_Aint true_lb = 0, sendtype_true_extent = 0, recvtype_true_extent = 0;
  MPI_Comm shmem_comm, leader_comm;

  /* If not set (use of the algo directly, without the mvapich2 selector),
   * fall back to the mpich gather for the intra-node phase. */
  if (MV2_Gather_intra_node_function == NULL)
    MV2_Gather_intra_node_function = smpi_coll_tuned_gather_mpich;

  if (smpi_comm_get_leaders_comm(comm) == MPI_COMM_NULL) {
    smpi_comm_init_smp(comm);
  }
  comm_size = smpi_comm_size(comm);
  rank = smpi_comm_rank(comm);

  /* Nothing to move for this rank: no resources held yet, plain return. */
  if (((rank == root) && (recvcnt == 0)) ||
      ((rank != root) && (sendcnt == 0))) {
    return MPI_SUCCESS;
  }

  if (sendtype != MPI_DATATYPE_NULL) {
    sendtype_extent = smpi_datatype_get_extent(sendtype);
    sendtype_size = smpi_datatype_size(sendtype);
    smpi_datatype_extent(sendtype, &true_lb, &sendtype_true_extent);
  }
  if (recvtype != MPI_DATATYPE_NULL) {
    recvtype_extent = smpi_datatype_get_extent(recvtype);
    recvtype_size = smpi_datatype_size(recvtype);
    smpi_datatype_extent(recvtype, &true_lb, &recvtype_true_extent);
  }

  /* Extract the rank, size information for the intra-node communicator. */
  shmem_comm = smpi_comm_get_intra_comm(comm);
  local_rank = smpi_comm_rank(shmem_comm);
  local_size = smpi_comm_size(shmem_comm);

  if (local_rank == 0) {
    /* Node leader. Extract the rank, size information for the leader
     * communicator. */
    leader_comm = smpi_comm_get_leaders_comm(comm);
    if (leader_comm == MPI_COMM_NULL) {
      leader_comm = MPI_COMM_WORLD;
    }
    leader_comm_size = smpi_comm_size(leader_comm);
    leader_comm_rank = smpi_comm_rank(leader_comm);
  }

  /* Size in bytes of one rank's contribution, as seen from this rank. */
  if (rank == root) {
    nbytes = recvcnt * recvtype_size;
  } else {
    nbytes = sendcnt * sendtype_size;
  }

#if defined(_SMP_LIMIC_)
  if ((g_use_limic2_coll) && (shmem_commptr->ch.use_intra_sock_comm == 1)
      && (use_limic_gather)
      && ((num_scheme == USE_GATHER_PT_PT_BINOMIAL)
          || (num_scheme == USE_GATHER_PT_PT_DIRECT)
          || (num_scheme == USE_GATHER_PT_LINEAR_BINOMIAL)
          || (num_scheme == USE_GATHER_PT_LINEAR_DIRECT)
          || (num_scheme == USE_GATHER_LINEAR_PT_BINOMIAL)
          || (num_scheme == USE_GATHER_LINEAR_PT_DIRECT)
          || (num_scheme == USE_GATHER_LINEAR_LINEAR)
          || (num_scheme == USE_GATHER_SINGLE_LEADER))) {
    mpi_errno = MV2_Gather_intra_node_function(sendbuf, sendcnt, sendtype,
                                               recvbuf, recvcnt, recvtype,
                                               root, comm);
  } else
#endif /* #if defined(_SMP_LIMIC_) */
  {
    if (local_rank == 0) {
      /* Node leader, allocate tmp_buffer large enough for the
       * contributions of all local_size processes of this node. */
      if (rank == root) {
        tmp_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent, recvtype_true_extent) * local_size);
      } else {
        tmp_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent, sendtype_true_extent) * local_size);
      }
      if (tmp_buf == NULL) {
        mpi_errno = MPI_ERR_OTHER;
        goto fn_exit;
      }
    }
    /* While testing the mpich2 gather test, which basically splits the
     * comm, we reach a point where use_intra_sock_comm == 0; but if the
     * intra-node function is MPIR_Intra_node_LIMIC_Gather_MV2, it would
     * use the intra sock comm. In such cases, we fall back to binomial
     * as a default case. */
#if defined(_SMP_LIMIC_)
    if (*MV2_Gather_intra_node_function == MPIR_Intra_node_LIMIC_Gather_MV2) {
      mpi_errno = MPIR_pt_pt_intra_gather(sendbuf, sendcnt, sendtype,
                                          recvbuf, recvcnt, recvtype,
                                          root, rank,
                                          tmp_buf, nbytes,
                                          TEMP_BUF_HAS_NO_DATA,
                                          shmem_commptr,
                                          MPIR_Gather_intra);
    } else
#endif
    {
      /* We are gathering the data into tmp_buf and the output
       * will be of MPI_BYTE datatype. Since the tmp_buf has no
       * local data, we pass is_data_avail = TEMP_BUF_HAS_NO_DATA. */
      mpi_errno = MPIR_pt_pt_intra_gather(sendbuf, sendcnt, sendtype,
                                          recvbuf, recvcnt, recvtype,
                                          root, rank,
                                          tmp_buf, nbytes,
                                          TEMP_BUF_HAS_NO_DATA,
                                          shmem_comm,
                                          MV2_Gather_intra_node_function);
    }
  }

  leader_comm = smpi_comm_get_leaders_comm(comm);
  leaders_map = smpi_comm_get_leaders_map(comm);
  /* leader_of_root: rank in comm of the leader of root's node. */
  leader_of_root = smpi_group_rank(smpi_comm_group(comm), leaders_map[root]);
  /* leader_root is the rank of the leader of the root in leader_comm.
   * leader_root is to be used as the root of the inter-leader gather ops. */
  leader_root = smpi_group_rank(smpi_comm_group(leader_comm), leaders_map[root]);

  if (!smpi_comm_is_uniform(comm)) {
    if (local_rank == 0) {
      int *displs = NULL;
      int *recvcnts = NULL;
      int *node_sizes;
      int i = 0;

      /* Node leaders have all the data. But, different nodes can have
       * different number of processes. The per-node sizes come from the
       * non-uniform map and drive a Gatherv that moves the actual data. */
      if (leader_comm_rank == leader_root && root != leader_of_root) {
        /* The root of the Gather operation is not a node-level
         * leader and this process's rank in the leader_comm
         * is the same as leader_root: stage the full result here. */
        if (rank == root) {
          leader_gather_buf = smpi_get_tmp_recvbuffer(recvcnt * MAX(recvtype_extent, recvtype_true_extent) * comm_size);
        } else {
          leader_gather_buf = smpi_get_tmp_sendbuffer(sendcnt * MAX(sendtype_extent, sendtype_true_extent) * comm_size);
        }
        if (leader_gather_buf == NULL) {
          mpi_errno = MPI_ERR_OTHER;
          goto fn_exit; /* releases tmp_buf */
        }
      }

      node_sizes = smpi_comm_get_non_uniform_map(comm);

      if (leader_comm_rank == leader_root) {
        displs = xbt_malloc(sizeof (int) * leader_comm_size);
        recvcnts = xbt_malloc(sizeof (int) * leader_comm_size);
        if (!displs || !recvcnts) {
          /* Release whichever array was obtained before bailing out. */
          if (displs) {
            xbt_free(displs);
          }
          if (recvcnts) {
            xbt_free(recvcnts);
          }
          mpi_errno = MPI_ERR_OTHER;
          goto fn_exit; /* releases tmp_buf and leader_gather_buf */
        }
      }

      if (root == leader_of_root) {
        /* The root of the gather operation is also the node
         * leader. Receive into recvbuf and we are done.
         * Counts/displacements are expressed in recvtype elements. */
        if (leader_comm_rank == leader_root) {
          recvcnts[0] = node_sizes[0] * recvcnt;
          displs[0] = 0;

          for (i = 1; i < leader_comm_size; i++) {
            displs[i] = displs[i - 1] + node_sizes[i - 1] * recvcnt;
            recvcnts[i] = node_sizes[i] * recvcnt;
          }
        }
        smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE,
                         recvbuf, recvcnts, displs, recvtype,
                         leader_root, leader_comm);
      } else {
        /* The root of the gather operation is not the node leader.
         * Receive into leader_gather_buf and then send to the root.
         * Counts/displacements are expressed in bytes. */
        if (leader_comm_rank == leader_root) {
          recvcnts[0] = node_sizes[0] * nbytes;
          displs[0] = 0;

          for (i = 1; i < leader_comm_size; i++) {
            displs[i] = displs[i - 1] + node_sizes[i - 1] * nbytes;
            recvcnts[i] = node_sizes[i] * nbytes;
          }
        }
        smpi_mpi_gatherv(tmp_buf, local_size * nbytes, MPI_BYTE,
                         leader_gather_buf, recvcnts, displs, MPI_BYTE,
                         leader_root, leader_comm);
      }

      if (leader_comm_rank == leader_root) {
        xbt_free(displs);
        xbt_free(recvcnts);
      }
    }
  } else {
    /* All nodes have the same number of processes.
     * Just do one Gather to get all
     * the data at the leader of the root process. */
    if (local_rank == 0) {
      if (leader_comm_rank == leader_root && root != leader_of_root) {
        /* The root of the Gather operation is not a node-level leader. */
        leader_gather_buf = smpi_get_tmp_sendbuffer(nbytes * comm_size);
        if (leader_gather_buf == NULL) {
          mpi_errno = MPI_ERR_OTHER;
          goto fn_exit; /* releases tmp_buf */
        }
      }
      if (root == leader_of_root) {
        mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size, MPI_BYTE,
                                           recvbuf, recvcnt * local_size, recvtype,
                                           leader_root, leader_comm);
      } else {
        mpi_errno = MPIR_Gather_MV2_Direct(tmp_buf, nbytes * local_size, MPI_BYTE,
                                           leader_gather_buf, nbytes * local_size, MPI_BYTE,
                                           leader_root, leader_comm);
      }
    }
  }

  if ((local_rank == 0) && (root != rank) && (leader_of_root == rank)) {
    /* This leader staged the result on behalf of a non-leader root:
     * forward the whole buffer to it. */
    smpi_mpi_send(leader_gather_buf, nbytes * comm_size, MPI_BYTE,
                  root, COLL_TAG_GATHER, comm);
  }

  if (rank == root && local_rank != 0) {
    /* The root of the gather operation is not the node leader. Receive
     * the data from the node leader. */
    smpi_mpi_recv(recvbuf, recvcnt * comm_size, recvtype,
                  leader_of_root, COLL_TAG_GATHER, comm, &status);
  }

fn_exit:
  /* Common exit: release the temporary buffers obtained by a node leader. */
  if (local_rank == 0) {
    if (tmp_buf != NULL) {
      smpi_free_tmp_buffer(tmp_buf);
    }
    if (leader_gather_buf != NULL) {
      smpi_free_tmp_buffer(leader_gather_buf);
    }
  }

  return (mpi_errno);
}