int MPIR_Alltoall_intra( const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, int *errflag ) { int comm_size, i, j, pof2; MPI_Aint sendtype_extent, recvtype_extent; MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb; int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes; int mpi_errno_ret = MPI_SUCCESS; MPI_Status status; int sendtype_size, pack_size, block, position, *displs, count; MPI_Datatype newtype = MPI_DATATYPE_NULL; void *tmp_buf; MPI_Comm comm; MPI_Request *reqarray; MPI_Status *starray; MPIU_CHKLMEM_DECL(6); #ifdef MPIR_OLD_SHORT_ALLTOALL_ALG MPI_Aint sendtype_true_extent, sendbuf_extent, sendtype_true_lb; int k, p, curr_cnt, dst_tree_root, my_tree_root; int last_recv_cnt, mask, tmp_mask, tree_root, nprocs_completed; #endif if (recvcount == 0) return MPI_SUCCESS; comm = comm_ptr->handle; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* Get extent of send and recv types */ MPID_Datatype_get_extent_macro(recvtype, recvtype_extent); MPID_Datatype_get_extent_macro(sendtype, sendtype_extent); MPID_Datatype_get_size_macro(sendtype, sendtype_size); nbytes = sendtype_size * sendcount; /* check if multiple threads are calling this collective function */ MPIDU_ERR_CHECK_MULTIPLE_THREADS_ENTER( comm_ptr ); if (sendbuf == MPI_IN_PLACE) { /* We use pair-wise sendrecv_replace in order to conserve memory usage, * which is keeping with the spirit of the MPI-2.2 Standard. But * because of this approach all processes must agree on the global * schedule of sendrecv_replace operations to avoid deadlock. * * Note that this is not an especially efficient algorithm in terms of * time and there will be multiple repeated malloc/free's rather than * maintaining a single buffer across the whole loop. Something like * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */ for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i) { /* also covers the (rank == i && rank == j) case */ mpi_errno = MPIC_Sendrecv_replace_ft(((char *)recvbuf + j*recvcount*recvtype_extent), recvcount, recvtype, j, MPIR_ALLTOALL_TAG, j, MPIR_ALLTOALL_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } else if (rank == j) { /* same as above with i/j args reversed */ mpi_errno = MPIC_Sendrecv_replace_ft(((char *)recvbuf + i*recvcount*recvtype_extent), recvcount, recvtype, i, MPIR_ALLTOALL_TAG, i, MPIR_ALLTOALL_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } } else if ((nbytes <= MPIR_PARAM_ALLTOALL_SHORT_MSG_SIZE) && (comm_size >= 8)) { /* use the indexing algorithm by Jehoshua Bruck et al, * IEEE TPDS, Nov. 97 */ /* allocate temporary buffer */ MPIR_Pack_size_impl(recvcount*comm_size, recvtype, &pack_size); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, pack_size, mpi_errno, "tmp_buf"); /* Do Phase 1 of the algorithim. Shift the data blocks on process i * upwards by a distance of i blocks. Store the result in recvbuf. */ mpi_errno = MPIR_Localcopy((char *) sendbuf + rank*sendcount*sendtype_extent, (comm_size - rank)*sendcount, sendtype, recvbuf, (comm_size - rank)*recvcount, recvtype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = MPIR_Localcopy(sendbuf, rank*sendcount, sendtype, (char *) recvbuf + (comm_size-rank)*recvcount*recvtype_extent, rank*recvcount, recvtype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* Input data is now stored in recvbuf with datatype recvtype */ /* Now do Phase 2, the communication phase. It takes ceiling(lg p) steps. In each step i, each process sends to rank+2^i and receives from rank-2^i, and exchanges all data blocks whose ith bit is 1. */ /* allocate displacements array for indexed datatype used in communication */ MPIU_CHKLMEM_MALLOC(displs, int *, comm_size * sizeof(int), mpi_errno, "displs"); pof2 = 1; while (pof2 < comm_size) { dst = (rank + pof2) % comm_size; src = (rank - pof2 + comm_size) % comm_size; /* Exchange all data blocks whose ith bit is 1 */ /* Create an indexed datatype for the purpose */ count = 0; for (block=1; block<comm_size; block++) { if (block & pof2) { displs[count] = block * recvcount; count++; } } mpi_errno = MPIR_Type_create_indexed_block_impl(count, recvcount, displs, recvtype, &newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); position = 0; mpi_errno = MPIR_Pack_impl(recvbuf, 1, newtype, tmp_buf, pack_size, &position); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPIC_Sendrecv_ft(tmp_buf, position, MPI_PACKED, dst, MPIR_ALLTOALL_TAG, recvbuf, 1, newtype, src, MPIR_ALLTOALL_TAG, comm, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } MPIR_Type_free_impl(&newtype); pof2 *= 2; } /* Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need * a temporary buffer of the same size as recvbuf. */ /* get true extent of recvtype */ MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent); recvbuf_extent = recvcount * comm_size * (MPIR_MAX(recvtype_true_extent, recvtype_extent)); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, recvbuf_extent, mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb); mpi_errno = MPIR_Localcopy((char *) recvbuf + (rank+1)*recvcount*recvtype_extent, (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, (comm_size - rank - 1)*recvcount, recvtype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } mpi_errno = MPIR_Localcopy(recvbuf, (rank+1)*recvcount, recvtype, (char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent, (rank+1)*recvcount, recvtype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* Blocks are in the reverse order now (comm_size-1 to 0). * Reorder them to (0 to comm_size-1) and store them in recvbuf. */ for (i=0; i<comm_size; i++){ mpi_errno = MPIR_Localcopy((char *) tmp_buf + i*recvcount*recvtype_extent, recvcount, recvtype, (char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent, recvcount, recvtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } #ifdef MPIR_OLD_SHORT_ALLTOALL_ALG /* Short message. Use recursive doubling. Each process sends all its data at each step along with all data it received in previous steps. */ /* need to allocate temporary buffer of size sendbuf_extent*comm_size */ /* get true extent of sendtype */ MPIR_Type_get_true_extent_impl(sendtype, &sendtype_true_lb, &sendtype_true_extent); sendbuf_extent = sendcount * comm_size * (MPIR_MAX(sendtype_true_extent, sendtype_extent)); MPIU_CHKLMEM_MALLOC(tmp_buf, void *, sendbuf_extent*comm_size, mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - sendtype_true_lb); /* copy local sendbuf into tmp_buf at location indexed by rank */ curr_cnt = sendcount*comm_size; mpi_errno = MPIR_Localcopy(sendbuf, curr_cnt, sendtype, ((char *)tmp_buf + rank*sendbuf_extent), curr_cnt, sendtype); if (mpi_errno) { MPIU_ERR_POP(mpi_errno);} mask = 0x1; i = 0; while (mask < comm_size) { dst = rank ^ mask; dst_tree_root = dst >> i; dst_tree_root <<= i; my_tree_root = rank >> i; my_tree_root <<= i; if (dst < comm_size) { mpi_errno = MPIC_Sendrecv_ft(((char *)tmp_buf + my_tree_root*sendbuf_extent), curr_cnt, sendtype, dst, MPIR_ALLTOALL_TAG, ((char *)tmp_buf + dst_tree_root*sendbuf_extent), sendbuf_extent*(comm_size-dst_tree_root), sendtype, dst, MPIR_ALLTOALL_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); last_recv_cnt = 0; } else /* in case of non-power-of-two nodes, less data may be received than specified */ MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt); curr_cnt += last_recv_cnt; } /* if some processes in this process's subtree in this step did not have any destination process to communicate with because of non-power-of-two, we need to send them the result. We use a logarithmic recursive-halfing algorithm for this. */ if (dst_tree_root + mask > comm_size) { nprocs_completed = comm_size - my_tree_root - mask; /* nprocs_completed is the number of processes in this subtree that have all the data. Send data to others in a tree fashion. First find root of current tree that is being divided into two. k is the number of least-significant bits in this process's rank that must be zeroed out to find the rank of the root */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; tmp_mask = mask >> 1; while (tmp_mask) { dst = rank ^ tmp_mask; tree_root = rank >> k; tree_root <<= k; /* send only if this proc has data and destination doesn't have data. at any step, multiple processes can send if they have the data */ if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) { /* send the data received in this step above */ mpi_errno = MPIC_Send_ft(((char *)tmp_buf + dst_tree_root*sendbuf_extent), last_recv_cnt, sendtype, dst, MPIR_ALLTOALL_TAG, comm, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); } } /* recv only if this proc. doesn't have data and sender has data */ else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) { mpi_errno = MPIC_Recv_ft(((char *)tmp_buf + dst_tree_root*sendbuf_extent), sendbuf_extent*(comm_size-dst_tree_root), sendtype, dst, MPIR_ALLTOALL_TAG, comm, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = TRUE; MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**fail"); MPIU_ERR_ADD(mpi_errno_ret, mpi_errno); last_recv_cnt = 0; } else MPIR_Get_count_impl(&status, sendtype, &last_recv_cnt); curr_cnt += last_recv_cnt; } tmp_mask >>= 1; k--; } } mask <<= 1; i++; }
int MPIR_Ialltoall_bruck(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPID_Comm *comm_ptr, MPID_Sched_t s) { int mpi_errno = MPI_SUCCESS; int i; int nbytes, recvtype_size, recvbuf_extent, newtype_size; int rank, comm_size; void *tmp_buf = NULL; MPI_Aint sendtype_extent, recvtype_extent, recvtype_true_lb, recvtype_true_extent; int pof2, dst, src; int count, block; MPI_Datatype newtype; int *displs; MPIU_CHKLMEM_DECL(1); /* displs */ MPIR_SCHED_CHKPMEM_DECL(2); /* tmp_buf (2x) */ MPIU_Assert(sendbuf != MPI_IN_PLACE); /* we do not handle in-place */ comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPID_Datatype_get_extent_macro(sendtype, sendtype_extent); MPID_Datatype_get_size_macro(recvtype, recvtype_size); MPID_Datatype_get_extent_macro(recvtype, recvtype_extent); /* allocate temporary buffer */ /* must be same size as entire recvbuf for Phase 3 */ nbytes = recvtype_size * recvcount * comm_size; MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, nbytes, mpi_errno, "tmp_buf"); /* Do Phase 1 of the algorithim. Shift the data blocks on process i * upwards by a distance of i blocks. Store the result in recvbuf. */ mpi_errno = MPID_Sched_copy(((char *) sendbuf + rank*sendcount*sendtype_extent), (comm_size - rank)*sendcount, sendtype, recvbuf, (comm_size - rank)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_copy(sendbuf, rank*sendcount, sendtype, ((char *) recvbuf + (comm_size-rank)*recvcount*recvtype_extent), rank*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* Input data is now stored in recvbuf with datatype recvtype */ /* Now do Phase 2, the communication phase. It takes ceiling(lg p) steps. In each step i, each process sends to rank+2^i and receives from rank-2^i, and exchanges all data blocks whose ith bit is 1. */ /* allocate displacements array for indexed datatype used in communication */ MPIU_CHKLMEM_MALLOC(displs, int *, comm_size * sizeof(int), mpi_errno, "displs"); pof2 = 1; while (pof2 < comm_size) { dst = (rank + pof2) % comm_size; src = (rank - pof2 + comm_size) % comm_size; /* Exchange all data blocks whose ith bit is 1 */ /* Create an indexed datatype for the purpose */ count = 0; for (block=1; block<comm_size; block++) { if (block & pof2) { displs[count] = block * recvcount; count++; } } mpi_errno = MPIR_Type_create_indexed_block_impl(count, recvcount, displs, recvtype, &newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&newtype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_Datatype_get_size_macro(newtype, newtype_size); /* we will usually copy much less than nbytes */ mpi_errno = MPID_Sched_copy(recvbuf, 1, newtype, tmp_buf, newtype_size, MPI_BYTE, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* now send and recv in parallel */ mpi_errno = MPID_Sched_send(tmp_buf, newtype_size, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_recv(recvbuf, 1, newtype, src, comm_ptr, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); MPIR_Type_free_impl(&newtype); pof2 *= 2; } /* Phase 3: Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need * a temporary buffer of the same size as recvbuf. */ /* get true extent of recvtype */ MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent); recvbuf_extent = recvcount * comm_size * (MPIR_MAX(recvtype_true_extent, recvtype_extent)); /* not a leak, old tmp_buf value is still tracked by CHKPMEM macros */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, recvbuf_extent, mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb); mpi_errno = MPID_Sched_copy(((char *) recvbuf + (rank+1)*recvcount*recvtype_extent), (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, (comm_size - rank - 1)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); mpi_errno = MPID_Sched_copy(recvbuf, (rank+1)*recvcount, recvtype, ((char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent), (rank+1)*recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); MPID_SCHED_BARRIER(s); /* Blocks are in the reverse order now (comm_size-1 to 0). * Reorder them to (0 to comm_size-1) and store them in recvbuf. */ for (i = 0; i < comm_size; i++){ mpi_errno = MPID_Sched_copy(((char *) tmp_buf + i*recvcount*recvtype_extent), recvcount, recvtype, ((char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent), recvcount, recvtype, s); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } MPIR_SCHED_CHKPMEM_COMMIT(s); fn_exit: MPIU_CHKLMEM_FREEALL(); return mpi_errno; fn_fail: MPIR_SCHED_CHKPMEM_REAP(s); goto fn_exit; }
int MPIR_Ireduce_scatter_sched_intra_recursive_doubling(const void *sendbuf, void *recvbuf, const int recvcounts[], MPI_Datatype datatype, MPI_Op op, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int rank, comm_size, i; MPI_Aint extent, true_extent, true_lb; int *disps; void *tmp_recvbuf, *tmp_results; int type_size ATTRIBUTE((unused)), dis[2], blklens[2], total_count, dst; int mask, dst_tree_root, my_tree_root, j, k; int received; MPI_Datatype sendtype, recvtype; int nprocs_completed, tmp_mask, tree_root, is_commutative; MPIR_SCHED_CHKPMEM_DECL(5); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; MPIR_Datatype_get_extent_macro(datatype, extent); MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent); is_commutative = MPIR_Op_is_commutative(op); MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps", MPL_MEM_BUFFER); total_count = 0; for (i=0; i<comm_size; i++) { disps[i] = total_count; total_count += recvcounts[i]; } if (total_count == 0) { goto fn_exit; } MPIR_Datatype_get_size_macro(datatype, type_size); /* total_count*extent eventually gets malloced. it isn't added to * a user-passed in buffer */ MPIR_Ensure_Aint_fits_in_pointer(total_count * MPL_MAX(true_extent, extent)); /* need to allocate temporary buffer to receive incoming data*/ MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_recvbuf = (void *)((char*)tmp_recvbuf - true_lb); /* need to allocate another temporary buffer to accumulate results */ MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count*(MPL_MAX(true_extent,extent)), mpi_errno, "tmp_results", MPL_MEM_BUFFER); /* adjust for potential negative lower bound in datatype */ tmp_results = (void *)((char*)tmp_results - true_lb); /* copy sendbuf into tmp_results */ if (sendbuf != MPI_IN_PLACE) mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype, tmp_results, total_count, datatype, s); else mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype, tmp_results, total_count, datatype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); mask = 0x1; i = 0; while (mask < comm_size) { dst = rank ^ mask; dst_tree_root = dst >> i; dst_tree_root <<= i; my_tree_root = rank >> i; my_tree_root <<= i; /* At step 1, processes exchange (n-n/p) amount of data; at step 2, (n-2n/p) amount of data; at step 3, (n-4n/p) amount of data, and so forth. We use derived datatypes for this. At each step, a process does not need to send data indexed from my_tree_root to my_tree_root+mask-1. Similarly, a process won't receive data indexed from dst_tree_root to dst_tree_root+mask-1. */ /* calculate sendtype */ blklens[0] = blklens[1] = 0; for (j=0; j<my_tree_root; j++) blklens[0] += recvcounts[j]; for (j=my_tree_root+mask; j<comm_size; j++) blklens[1] += recvcounts[j]; dis[0] = 0; dis[1] = blklens[0]; for (j=my_tree_root; (j<my_tree_root+mask) && (j<comm_size); j++) dis[1] += recvcounts[j]; mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &sendtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&sendtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* calculate recvtype */ blklens[0] = blklens[1] = 0; for (j=0; j<dst_tree_root && j<comm_size; j++) blklens[0] += recvcounts[j]; for (j=dst_tree_root+mask; j<comm_size; j++) blklens[1] += recvcounts[j]; dis[0] = 0; dis[1] = blklens[0]; for (j=dst_tree_root; (j<dst_tree_root+mask) && (j<comm_size); j++) dis[1] += recvcounts[j]; mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); received = 0; if (dst < comm_size) { /* tmp_results contains data to be sent in each step. Data is received in tmp_recvbuf and then accumulated into tmp_results. accumulation is done later below. */ mpi_errno = MPIR_Sched_send(tmp_results, 1, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); received = 1; } /* if some processes in this process's subtree in this step did not have any destination process to communicate with because of non-power-of-two, we need to send them the result. We use a logarithmic recursive-halfing algorithm for this. */ if (dst_tree_root + mask > comm_size) { nprocs_completed = comm_size - my_tree_root - mask; /* nprocs_completed is the number of processes in this subtree that have all the data. Send data to others in a tree fashion. First find root of current tree that is being divided into two. k is the number of least-significant bits in this process's rank that must be zeroed out to find the rank of the root */ j = mask; k = 0; while (j) { j >>= 1; k++; } k--; tmp_mask = mask >> 1; while (tmp_mask) { dst = rank ^ tmp_mask; tree_root = rank >> k; tree_root <<= k; /* send only if this proc has data and destination doesn't have data. at any step, multiple processes can send if they have the data */ if ((dst > rank) && (rank < tree_root + nprocs_completed) && (dst >= tree_root + nprocs_completed)) { /* send the current result */ mpi_errno = MPIR_Sched_send(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } /* recv only if this proc. doesn't have data and sender has data */ else if ((dst < rank) && (dst < tree_root + nprocs_completed) && (rank >= tree_root + nprocs_completed)) { mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); received = 1; } tmp_mask >>= 1; k--; } } /* N.B. The following comment comes from the FT version of * MPI_Reduce_scatter. It does not currently apply to this code, but * will in the future when we update the NBC code to be fault-tolerant * in roughly the same fashion. [goodell@ 2011-03-03] */ /* The following reduction is done here instead of after the MPIC_Sendrecv or MPIC_Recv above. This is because to do it above, in the noncommutative case, we would need an extra temp buffer so as not to overwrite temp_recvbuf, because temp_recvbuf may have to be communicated to other processes in the non-power-of-two case. To avoid that extra allocation, we do the reduce here. */ if (received) { if (is_commutative || (dst_tree_root < my_tree_root)) { mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results, blklens[0], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(((char *)tmp_recvbuf + dis[1]*extent), ((char *)tmp_results + dis[1]*extent), blklens[1], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } else { mpi_errno = MPIR_Sched_reduce(tmp_results, tmp_recvbuf, blklens[0], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_reduce(((char *)tmp_results + dis[1]*extent), ((char *)tmp_recvbuf + dis[1]*extent), blklens[1], datatype, op, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* copy result back into tmp_results */ mpi_errno = MPIR_Sched_copy(tmp_recvbuf, 1, recvtype, tmp_results, 1, recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); } } MPIR_Type_free_impl(&sendtype); MPIR_Type_free_impl(&recvtype); mask <<= 1; i++; }
int MPIR_Alltoall_intra( const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, MPIR_Comm *comm_ptr, MPIR_Errflag_t *errflag ) { int comm_size, i, j, pof2; MPI_Aint sendtype_extent, recvtype_extent; MPI_Aint recvtype_true_extent, recvbuf_extent, recvtype_true_lb; int mpi_errno=MPI_SUCCESS, src, dst, rank, nbytes; int mpi_errno_ret = MPI_SUCCESS; MPI_Status status; int sendtype_size, block, *displs, count; MPI_Aint pack_size, position; MPI_Datatype newtype = MPI_DATATYPE_NULL; void *tmp_buf; MPIR_Request **reqarray; MPI_Status *starray; MPIR_CHKLMEM_DECL(6); if (recvcount == 0) return MPI_SUCCESS; comm_size = comm_ptr->local_size; rank = comm_ptr->rank; /* Get extent of send and recv types */ MPID_Datatype_get_extent_macro(recvtype, recvtype_extent); MPID_Datatype_get_extent_macro(sendtype, sendtype_extent); MPID_Datatype_get_size_macro(sendtype, sendtype_size); nbytes = sendtype_size * sendcount; if (sendbuf == MPI_IN_PLACE) { /* We use pair-wise sendrecv_replace in order to conserve memory usage, * which is keeping with the spirit of the MPI-2.2 Standard. But * because of this approach all processes must agree on the global * schedule of sendrecv_replace operations to avoid deadlock. * * Note that this is not an especially efficient algorithm in terms of * time and there will be multiple repeated malloc/free's rather than * maintaining a single buffer across the whole loop. Something like * MADRE is probably the best solution for the MPI_IN_PLACE scenario. */ for (i = 0; i < comm_size; ++i) { /* start inner loop at i to avoid re-exchanging data */ for (j = i; j < comm_size; ++j) { if (rank == i) { /* also covers the (rank == i && rank == j) case */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + j*recvcount*recvtype_extent), recvcount, recvtype, j, MPIR_ALLTOALL_TAG, j, MPIR_ALLTOALL_TAG, comm_ptr, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } else if (rank == j) { /* same as above with i/j args reversed */ mpi_errno = MPIC_Sendrecv_replace(((char *)recvbuf + i*recvcount*recvtype_extent), recvcount, recvtype, i, MPIR_ALLTOALL_TAG, i, MPIR_ALLTOALL_TAG, comm_ptr, &status, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } } } } } else if ((nbytes <= MPIR_CVAR_ALLTOALL_SHORT_MSG_SIZE) && (comm_size >= 8)) { /* use the indexing algorithm by Jehoshua Bruck et al, * IEEE TPDS, Nov. 97 */ /* allocate temporary buffer */ MPIR_Pack_size_impl(recvcount*comm_size, recvtype, &pack_size); MPIR_CHKLMEM_MALLOC(tmp_buf, void *, pack_size, mpi_errno, "tmp_buf"); /* Do Phase 1 of the algorithim. Shift the data blocks on process i * upwards by a distance of i blocks. Store the result in recvbuf. */ mpi_errno = MPIR_Localcopy((char *) sendbuf + rank*sendcount*sendtype_extent, (comm_size - rank)*sendcount, sendtype, recvbuf, (comm_size - rank)*recvcount, recvtype); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } mpi_errno = MPIR_Localcopy(sendbuf, rank*sendcount, sendtype, (char *) recvbuf + (comm_size-rank)*recvcount*recvtype_extent, rank*recvcount, recvtype); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } /* Input data is now stored in recvbuf with datatype recvtype */ /* Now do Phase 2, the communication phase. It takes ceiling(lg p) steps. In each step i, each process sends to rank+2^i and receives from rank-2^i, and exchanges all data blocks whose ith bit is 1. */ /* allocate displacements array for indexed datatype used in communication */ MPIR_CHKLMEM_MALLOC(displs, int *, comm_size * sizeof(int), mpi_errno, "displs"); pof2 = 1; while (pof2 < comm_size) { dst = (rank + pof2) % comm_size; src = (rank - pof2 + comm_size) % comm_size; /* Exchange all data blocks whose ith bit is 1 */ /* Create an indexed datatype for the purpose */ count = 0; for (block=1; block<comm_size; block++) { if (block & pof2) { displs[count] = block * recvcount; count++; } } mpi_errno = MPIR_Type_create_indexed_block_impl(count, recvcount, displs, recvtype, &newtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&newtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); position = 0; mpi_errno = MPIR_Pack_impl(recvbuf, 1, newtype, tmp_buf, pack_size, &position); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIC_Sendrecv(tmp_buf, position, MPI_PACKED, dst, MPIR_ALLTOALL_TAG, recvbuf, 1, newtype, src, MPIR_ALLTOALL_TAG, comm_ptr, MPI_STATUS_IGNORE, errflag); if (mpi_errno) { /* for communication errors, just record the error but continue */ *errflag = MPIR_ERR_GET_CLASS(mpi_errno); MPIR_ERR_SET(mpi_errno, *errflag, "**fail"); MPIR_ERR_ADD(mpi_errno_ret, mpi_errno); } MPIR_Type_free_impl(&newtype); pof2 *= 2; } /* Rotate blocks in recvbuf upwards by (rank + 1) blocks. Need * a temporary buffer of the same size as recvbuf. */ /* get true extent of recvtype */ MPIR_Type_get_true_extent_impl(recvtype, &recvtype_true_lb, &recvtype_true_extent); recvbuf_extent = recvcount * comm_size * (MPL_MAX(recvtype_true_extent, recvtype_extent)); MPIR_CHKLMEM_MALLOC(tmp_buf, void *, recvbuf_extent, mpi_errno, "tmp_buf"); /* adjust for potential negative lower bound in datatype */ tmp_buf = (void *)((char*)tmp_buf - recvtype_true_lb); mpi_errno = MPIR_Localcopy((char *) recvbuf + (rank+1)*recvcount*recvtype_extent, (comm_size - rank - 1)*recvcount, recvtype, tmp_buf, (comm_size - rank - 1)*recvcount, recvtype); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } mpi_errno = MPIR_Localcopy(recvbuf, (rank+1)*recvcount, recvtype, (char *) tmp_buf + (comm_size-rank-1)*recvcount*recvtype_extent, (rank+1)*recvcount, recvtype); if (mpi_errno) { MPIR_ERR_POP(mpi_errno); } /* Blocks are in the reverse order now (comm_size-1 to 0). * Reorder them to (0 to comm_size-1) and store them in recvbuf. */ for (i=0; i<comm_size; i++){ mpi_errno = MPIR_Localcopy((char *) tmp_buf + i*recvcount*recvtype_extent, recvcount, recvtype, (char *) recvbuf + (comm_size-i-1)*recvcount*recvtype_extent, recvcount, recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } }
int MPIR_Igather_sched_intra_binomial(const void *sendbuf, int sendcount, MPI_Datatype sendtype, void *recvbuf, int recvcount, MPI_Datatype recvtype, int root, MPIR_Comm *comm_ptr, MPIR_Sched_t s) { int mpi_errno = MPI_SUCCESS; int comm_size, rank; int relative_rank, is_homogeneous; int mask, src, dst, relative_src; MPI_Aint recvtype_size, sendtype_size, curr_cnt=0, nbytes; int recvblks; int tmp_buf_size, missing; void *tmp_buf = NULL; int blocks[2]; int displs[2]; MPI_Aint struct_displs[2]; MPI_Aint extent=0; int copy_offset = 0, copy_blks = 0; MPI_Datatype types[2], tmp_type; MPIR_SCHED_CHKPMEM_DECL(1); comm_size = comm_ptr->local_size; rank = comm_ptr->rank; if (((rank == root) && (recvcount == 0)) || ((rank != root) && (sendcount == 0))) goto fn_exit; is_homogeneous = TRUE; #ifdef MPID_HAS_HETERO is_homogeneous = !comm_ptr->is_hetero; #endif MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM); /* Use binomial tree algorithm. */ relative_rank = (rank >= root) ? rank - root : rank - root + comm_size; if (rank == root) { MPIR_Datatype_get_extent_macro(recvtype, extent); MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf+ (extent*recvcount*comm_size)); } if (is_homogeneous) { /* communicator is homogeneous. no need to pack buffer. */ if (rank == root) { MPIR_Datatype_get_size_macro(recvtype, recvtype_size); nbytes = recvtype_size * recvcount; } else { MPIR_Datatype_get_size_macro(sendtype, sendtype_size); nbytes = sendtype_size * sendcount; } /* Find the number of missing nodes in my sub-tree compared to * a balanced tree */ for (mask = 1; mask < comm_size; mask <<= 1); --mask; while (relative_rank & mask) mask >>= 1; missing = (relative_rank | mask) - comm_size + 1; if (missing < 0) missing = 0; tmp_buf_size = (mask - missing); /* If the message is smaller than the threshold, we will copy * our message in there too */ if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) tmp_buf_size++; tmp_buf_size *= nbytes; /* For zero-ranked root, we don't need any temporary buffer */ if ((rank == root) && (!root || (nbytes >= MPIR_CVAR_GATHER_VSMALL_MSG_SIZE))) tmp_buf_size = 0; if (tmp_buf_size) { MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, tmp_buf_size, mpi_errno, "tmp_buf", MPL_MEM_BUFFER); } if (rank == root) { if (sendbuf != MPI_IN_PLACE) { mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype, ((char *) recvbuf + extent*recvcount*rank), recvcount, recvtype); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } else if (tmp_buf_size && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)) { /* copy from sendbuf into tmp_buf */ mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype, tmp_buf, nbytes, MPI_BYTE); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } curr_cnt = nbytes; mask = 0x1; while (mask < comm_size) { if ((mask & relative_rank) == 0) { src = relative_rank | mask; if (src < comm_size) { src = (src + root) % comm_size; if (rank == root) { recvblks = mask; if ((2 * recvblks) > comm_size) recvblks = comm_size - recvblks; if ((rank + mask + recvblks == comm_size) || (((rank + mask) % comm_size) < ((rank + mask + recvblks) % comm_size))) { /* If the data contiguously fits into the * receive buffer, place it directly. This * should cover the case where the root is * rank 0. */ char *rp = (char *)recvbuf + (((rank + mask) % comm_size)*recvcount*extent); mpi_errno = MPIR_Sched_recv(rp, (recvblks * recvcount), recvtype, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) { mpi_errno = MPIR_Sched_recv(tmp_buf, (recvblks * nbytes), MPI_BYTE, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); copy_offset = rank + mask; copy_blks = recvblks; } else { blocks[0] = recvcount * (comm_size - root - mask); displs[0] = recvcount * (root + mask); blocks[1] = (recvcount * recvblks) - blocks[0]; displs[1] = 0; mpi_errno = MPIR_Type_indexed_impl(2, blocks, displs, recvtype, &tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_recv(recvbuf, 1, tmp_type, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); /* this "premature" free is safe b/c the sched holds an actual ref to keep it alive */ MPIR_Type_free_impl(&tmp_type); } } else { /* Intermediate nodes store in temporary buffer */ MPI_Aint offset; /* Estimate the amount of data that is going to come in */ recvblks = mask; relative_src = ((src - root) < 0) ? (src - root + comm_size) : (src - root); if (relative_src + mask > comm_size) recvblks -= (relative_src + mask - comm_size); if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) offset = mask * nbytes; else offset = (mask - 1) * nbytes; mpi_errno = MPIR_Sched_recv(((char *)tmp_buf + offset), (recvblks * nbytes), MPI_BYTE, src, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); curr_cnt += (recvblks * nbytes); } } } else { dst = relative_rank ^ mask; dst = (dst + root) % comm_size; if (!tmp_buf_size) { /* leaf nodes send directly from sendbuf */ mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) { mpi_errno = MPIR_Sched_send(tmp_buf, curr_cnt, MPI_BYTE, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_barrier(s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } else { blocks[0] = sendcount; struct_displs[0] = MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf; types[0] = sendtype; /* check for overflow. work around int limits if needed*/ if (curr_cnt - nbytes != (int)(curr_cnt-nbytes)) { blocks[1] = 1; MPIR_Type_contiguous_x_impl(curr_cnt - nbytes, MPI_BYTE, &(types[1])); } else { MPIR_Assign_trunc(blocks[1], curr_cnt - nbytes, int); types[1] = MPI_BYTE; } struct_displs[1] = MPIR_VOID_PTR_CAST_TO_MPI_AINT tmp_buf; mpi_errno = MPIR_Type_create_struct_impl(2, blocks, struct_displs, types, &tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Type_commit_impl(&tmp_type); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_send(MPI_BOTTOM, 1, tmp_type, dst, comm_ptr, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); MPIR_SCHED_BARRIER(s); /* this "premature" free is safe b/c the sched holds an actual ref to keep it alive */ MPIR_Type_free_impl(&tmp_type); } break; } mask <<= 1; } if ((rank == root) && root && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) && copy_blks) { /* reorder and copy from tmp_buf into recvbuf */ /* FIXME why are there two copies here? */ mpi_errno = MPIR_Sched_copy(tmp_buf, nbytes * (comm_size - copy_offset), MPI_BYTE, ((char *)recvbuf + extent * recvcount * copy_offset), recvcount * (comm_size - copy_offset), recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); mpi_errno = MPIR_Sched_copy((char *)tmp_buf + nbytes * (comm_size - copy_offset), nbytes * (copy_blks - comm_size + copy_offset), MPI_BYTE, recvbuf, recvcount * (copy_blks - comm_size + copy_offset), recvtype, s); if (mpi_errno) MPIR_ERR_POP(mpi_errno); } } #ifdef MPID_HAS_HETERO else {
/*@ MPI_Type_commit - Commits the datatype Input Parameters: . datatype - datatype (handle) .N ThreadSafe .N Fortran .N Errors .N MPI_SUCCESS .N MPI_ERR_TYPE @*/ int MPI_Type_commit(MPI_Datatype *datatype) { int mpi_errno = MPI_SUCCESS; MPID_Datatype *datatype_ptr = NULL; MPID_MPI_STATE_DECL(MPID_STATE_MPI_TYPE_COMMIT); MPIR_ERRTEST_INITIALIZED_ORDIE(); MPIU_THREAD_CS_ENTER(ALLFUNC,); MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_TYPE_COMMIT); /* Validate parameters, especially handles needing to be converted */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { MPIR_ERRTEST_ARGNULL(datatype, "datatype", mpi_errno); MPIR_ERRTEST_DATATYPE(*datatype, "datatype", mpi_errno); } MPID_END_ERROR_CHECKS; } # endif /* Convert MPI object handles to object pointers */ MPID_Datatype_get_ptr( *datatype, datatype_ptr ); /* Validate parameters and objects (post conversion) */ # ifdef HAVE_ERROR_CHECKING { MPID_BEGIN_ERROR_CHECKS; { /* Validate datatype_ptr */ MPID_Datatype_valid_ptr(datatype_ptr, mpi_errno); if (mpi_errno) goto fn_fail; } MPID_END_ERROR_CHECKS; } # endif /* HAVE_ERROR_CHECKING */ /* ... body of routine ... */ mpi_errno = MPIR_Type_commit_impl(datatype); if (mpi_errno) MPIU_ERR_POP(mpi_errno); /* ... end of body of routine ... */ fn_exit: MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_TYPE_COMMIT); MPIU_THREAD_CS_EXIT(ALLFUNC,); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ # ifdef HAVE_ERROR_CHECKING { mpi_errno = MPIR_Err_create_code( mpi_errno, MPIR_ERR_RECOVERABLE, FCNAME, __LINE__, MPI_ERR_OTHER, "**mpi_type_commit", "**mpi_type_commit %p", datatype); } # endif mpi_errno = MPIR_Err_return_comm( NULL, FCNAME, mpi_errno ); goto fn_exit; /* --END ERROR HANDLING-- */ }