int MPIR_Ireduce_scatter_sched_intra_recursive_doubling(const void *sendbuf, void *recvbuf,
                                                        const int recvcounts[],
                                                        MPI_Datatype datatype, MPI_Op op,
                                                        MPIR_Comm *comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, comm_size, i;
    MPI_Aint extent, true_extent, true_lb;
    int *disps;
    void *tmp_recvbuf, *tmp_results;
    int type_size ATTRIBUTE((unused)), dis[2], blklens[2], total_count, dst;
    int mask, dst_tree_root, my_tree_root, j, k;
    int received;
    MPI_Datatype sendtype, recvtype;
    int nprocs_completed, tmp_mask, tree_root, is_commutative;
    MPIR_SCHED_CHKPMEM_DECL(5);

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    MPIR_Datatype_get_extent_macro(datatype, extent);
    MPIR_Type_get_true_extent_impl(datatype, &true_lb, &true_extent);

    is_commutative = MPIR_Op_is_commutative(op);

    MPIR_SCHED_CHKPMEM_MALLOC(disps, int *, comm_size * sizeof(int), mpi_errno, "disps",
                              MPL_MEM_BUFFER);

    total_count = 0;
    for (i = 0; i < comm_size; i++) {
        disps[i] = total_count;
        total_count += recvcounts[i];
    }

    if (total_count == 0)
        goto fn_exit;

    MPIR_Datatype_get_size_macro(datatype, type_size);

    /* total_count*extent eventually gets malloced. it isn't added to
     * a user-passed in buffer */
    MPIR_Ensure_Aint_fits_in_pointer(total_count * MPL_MAX(true_extent, extent));

    /* need to allocate temporary buffer to receive incoming data */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_recvbuf, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_recvbuf", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_recvbuf = (void *) ((char *) tmp_recvbuf - true_lb);

    /* need to allocate another temporary buffer to accumulate results */
    MPIR_SCHED_CHKPMEM_MALLOC(tmp_results, void *, total_count * (MPL_MAX(true_extent, extent)),
                              mpi_errno, "tmp_results", MPL_MEM_BUFFER);
    /* adjust for potential negative lower bound in datatype */
    tmp_results = (void *) ((char *) tmp_results - true_lb);

    /* copy sendbuf into tmp_results */
    if (sendbuf != MPI_IN_PLACE)
        mpi_errno = MPIR_Sched_copy(sendbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    else
        mpi_errno = MPIR_Sched_copy(recvbuf, total_count, datatype,
                                    tmp_results, total_count, datatype, s);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_SCHED_BARRIER(s);

    mask = 0x1;
    i = 0;
    while (mask < comm_size) {
        dst = rank ^ mask;

        dst_tree_root = dst >> i;
        dst_tree_root <<= i;

        my_tree_root = rank >> i;
        my_tree_root <<= i;

        /* At step 1, processes exchange (n-n/p) amount of data; at
         * step 2, (n-2n/p) amount of data; at step 3, (n-4n/p) amount
         * of data, and so forth. We use derived datatypes for this.
         *
         * At each step, a process does not need to send data indexed
         * from my_tree_root to my_tree_root+mask-1. Similarly, a
         * process won't receive data indexed from dst_tree_root to
         * dst_tree_root+mask-1.
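         *
         * For example, with comm_size = 4 and equal block sizes n/4:
         * at mask = 1 a process skips the single block in
         * [my_tree_root, my_tree_root+1) and sends n - n/4 elements;
         * at mask = 2 it skips [my_tree_root, my_tree_root+2) and
         * sends n - 2n/4 elements, matching the counts above.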
         */

        /* calculate sendtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < my_tree_root; j++)
            blklens[0] += recvcounts[j];
        for (j = my_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = my_tree_root; (j < my_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &sendtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Type_commit_impl(&sendtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* calculate recvtype */
        blklens[0] = blklens[1] = 0;
        for (j = 0; j < dst_tree_root && j < comm_size; j++)
            blklens[0] += recvcounts[j];
        for (j = dst_tree_root + mask; j < comm_size; j++)
            blklens[1] += recvcounts[j];

        dis[0] = 0;
        dis[1] = blklens[0];
        for (j = dst_tree_root; (j < dst_tree_root + mask) && (j < comm_size); j++)
            dis[1] += recvcounts[j];

        mpi_errno = MPIR_Type_indexed_impl(2, blklens, dis, datatype, &recvtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIR_Type_commit_impl(&recvtype);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        received = 0;
        if (dst < comm_size) {
            /* tmp_results contains data to be sent in each step. Data is
             * received in tmp_recvbuf and then accumulated into
             * tmp_results. accumulation is done later below. */
            mpi_errno = MPIR_Sched_send(tmp_results, 1, sendtype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_SCHED_BARRIER(s);
            received = 1;
        }

        /* if some processes in this process's subtree in this step did not
         * have any destination process to communicate with because of
         * non-power-of-two, we need to send them the result. We use a
         * logarithmic recursive-halving algorithm for this. */
        if (dst_tree_root + mask > comm_size) {
            nprocs_completed = comm_size - my_tree_root - mask;
            /* nprocs_completed is the number of processes in this subtree
             * that have all the data. Send data to others in a tree
             * fashion. First find root of current tree that is being
             * divided into two. k is the number of least-significant bits
             * in this process's rank that must be zeroed out to find the
             * rank of the root */
            j = mask;
            k = 0;
            while (j) {
                j >>= 1;
                k++;
            }
            k--;

            tmp_mask = mask >> 1;
            while (tmp_mask) {
                dst = rank ^ tmp_mask;

                tree_root = rank >> k;
                tree_root <<= k;

                /* send only if this proc has data and destination doesn't
                 * have data. at any step, multiple processes can send if
                 * they have the data */
                if ((dst > rank) &&
                    (rank < tree_root + nprocs_completed) &&
                    (dst >= tree_root + nprocs_completed)) {
                    /* send the current result */
                    mpi_errno = MPIR_Sched_send(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                }
                /* recv only if this proc. doesn't have data and sender has
                 * data */
                else if ((dst < rank) &&
                         (dst < tree_root + nprocs_completed) &&
                         (rank >= tree_root + nprocs_completed)) {
                    mpi_errno = MPIR_Sched_recv(tmp_recvbuf, 1, recvtype, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);
                    received = 1;
                }
                tmp_mask >>= 1;
                k--;
            }
        }

        /* N.B. The following comment comes from the FT version of
         * MPI_Reduce_scatter. It does not currently apply to this code, but
         * will in the future when we update the NBC code to be
         * fault-tolerant in roughly the same fashion. [goodell@ 2011-03-03] */
        /* The following reduction is done here instead of after the
         * MPIC_Sendrecv or MPIC_Recv above.
         * This is because to do it above, in the noncommutative case, we
         * would need an extra temp buffer so as not to overwrite
         * tmp_recvbuf, because tmp_recvbuf may have to be communicated to
         * other processes in the non-power-of-two case. To avoid that
         * extra allocation, we do the reduce here. */
        if (received) {
            if (is_commutative || (dst_tree_root < my_tree_root)) {
                mpi_errno = MPIR_Sched_reduce(tmp_recvbuf, tmp_results, blklens[0],
                                              datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_reduce(((char *) tmp_recvbuf + dis[1] * extent),
                                              ((char *) tmp_results + dis[1] * extent),
                                              blklens[1], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            } else {
                mpi_errno = MPIR_Sched_reduce(tmp_results, tmp_recvbuf, blklens[0],
                                              datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                mpi_errno = MPIR_Sched_reduce(((char *) tmp_results + dis[1] * extent),
                                              ((char *) tmp_recvbuf + dis[1] * extent),
                                              blklens[1], datatype, op, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);

                /* copy result back into tmp_results */
                mpi_errno = MPIR_Sched_copy(tmp_recvbuf, 1, recvtype,
                                            tmp_results, 1, recvtype, s);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                MPIR_SCHED_BARRIER(s);
            }
        }

        MPIR_Type_free_impl(&sendtype);
        MPIR_Type_free_impl(&recvtype);

        mask <<= 1;
        i++;
    }
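/* A minimal standalone sketch (not from the MPICH sources) of the tree-root
 * bookkeeping used by the non-power-of-two fix-up above: k is the number of
 * least-significant bits of rank that must be zeroed out to find the root of
 * the subtree of size mask.  The function and driver names are illustrative. */
#include <stdio.h>

static int tree_root_of(int rank, int mask)
{
    int j = mask, k = 0;
    while (j) {                 /* count bits up to and including mask's MSB */
        j >>= 1;
        k++;
    }
    k--;                        /* now k = floor(log2(mask)) */
    return (rank >> k) << k;    /* zero out the k low-order bits */
}

int main(void)
{
    /* comm_size = 6, mask = 4 is the first step at which
     * dst_tree_root + mask can exceed comm_size */
    for (int rank = 0; rank < 6; rank++)
        printf("rank %d: tree_root = %d\n", rank, tree_root_of(rank, 4));
    return 0;
}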
/*@
    MPI_Type_indexed - Creates an indexed datatype

Input Parameters:
+ count - number of blocks -- also number of entries in array_of_displacements and array_of_blocklengths
. array_of_blocklengths - number of elements in each block (array of nonnegative integers)
. array_of_displacements - displacement of each block in multiples of oldtype (array of integers)
- oldtype - old datatype (handle)

Output Parameters:
. newtype - new datatype (handle)

.N ThreadSafe

.N Fortran

The array_of_displacements are displacements, and are based on a
zero origin.  A common error is to do something like the following
.vb
    integer a(100)
    integer array_of_blocklengths(10), array_of_displacements(10)
    do i=1,10
         array_of_blocklengths(i) = 1
10       array_of_displacements(i) = 1 + (i-1)*10
    call MPI_TYPE_INDEXED(10,array_of_blocklengths,array_of_displacements,MPI_INTEGER,newtype,ierr)
    call MPI_TYPE_COMMIT(newtype,ierr)
    call MPI_SEND(a,1,newtype,...)
.ve
expecting this to send "a(1),a(11),..." because the array_of_displacements
have values "1,11,...".  Because these are `displacements` from the
beginning of "a", it actually sends "a(1+1),a(1+11),...".

If you wish to consider the displacements as indices into a Fortran array,
consider declaring the Fortran array with a zero origin
.vb
    integer a(0:99)
.ve

.N Errors
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
.N MPI_ERR_EXHAUSTED
@*/
int MPI_Type_indexed(int count,
                     const int *array_of_blocklengths,
                     const int *array_of_displacements,
                     MPI_Datatype oldtype,
                     MPI_Datatype *newtype)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_MPI_STATE_DECL(MPID_STATE_MPI_TYPE_INDEXED);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPIU_THREAD_CS_ENTER(ALLFUNC,);
    MPID_MPI_FUNC_ENTER(MPID_STATE_MPI_TYPE_INDEXED);

    /* Validate parameters and objects (post conversion) */
#   ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            int j;
            MPID_Datatype *datatype_ptr = NULL;

            MPIR_ERRTEST_COUNT(count, mpi_errno);
            if (count > 0) {
                MPIR_ERRTEST_ARGNULL(array_of_blocklengths, "array_of_blocklengths", mpi_errno);
                MPIR_ERRTEST_ARGNULL(array_of_displacements, "array_of_displacements", mpi_errno);
            }
            MPIR_ERRTEST_DATATYPE(oldtype, "datatype", mpi_errno);

            if (HANDLE_GET_KIND(oldtype) != HANDLE_KIND_BUILTIN) {
                MPID_Datatype_get_ptr(oldtype, datatype_ptr);
                MPID_Datatype_valid_ptr(datatype_ptr, mpi_errno);
            }
            /* verify that all blocklengths are >= 0 */
            for (j = 0; j < count; j++) {
                MPIR_ERRTEST_ARGNEG(array_of_blocklengths[j], "blocklength", mpi_errno);
            }
            MPIR_ERRTEST_ARGNULL(newtype, "newtype", mpi_errno);
        }
        MPID_END_ERROR_CHECKS;
    }
#   endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ... */

    mpi_errno = MPIR_Type_indexed_impl(count, array_of_blocklengths, array_of_displacements,
                                       oldtype, newtype);
    if (mpi_errno)
        goto fn_fail;

    /* ... end of body of routine ... */

  fn_exit:
    MPID_MPI_FUNC_EXIT(MPID_STATE_MPI_TYPE_INDEXED);
    MPIU_THREAD_CS_EXIT(ALLFUNC,);
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
#   ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE,
                                         FCNAME, __LINE__, MPI_ERR_OTHER,
                                         "**mpi_type_indexed",
                                         "**mpi_type_indexed %d %p %p %D %p",
                                         count, array_of_blocklengths,
                                         array_of_displacements, oldtype, newtype);
    }
#   endif
    mpi_errno = MPIR_Err_return_comm(NULL, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
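/* A small usage sketch (not from the MPICH sources) of the zero-origin rule
 * described in the man page above: to send a[0], a[10], a[20], ... the
 * displacements must be 0, 10, 20, ..., not 1, 11, 21, ....  This runs on a
 * single rank and only builds, inspects, and frees the type. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int blocklens[10], displs[10];
    MPI_Datatype newtype;
    MPI_Aint lb, extent;

    MPI_Init(&argc, &argv);

    for (int i = 0; i < 10; i++) {
        blocklens[i] = 1;
        displs[i] = i * 10;     /* zero-based: selects a[0], a[10], a[20], ... */
    }

    MPI_Type_indexed(10, blocklens, displs, MPI_INT, &newtype);
    MPI_Type_commit(&newtype);

    MPI_Type_get_extent(newtype, &lb, &extent);
    printf("indexed type: lb = %ld bytes, extent = %ld bytes\n",
           (long) lb, (long) extent);

    MPI_Type_free(&newtype);
    MPI_Finalize();
    return 0;
}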
int MPIR_Igather_sched_intra_binomial(const void *sendbuf, int sendcount, MPI_Datatype sendtype,
                                      void *recvbuf, int recvcount, MPI_Datatype recvtype,
                                      int root, MPIR_Comm *comm_ptr, MPIR_Sched_t s)
{
    int mpi_errno = MPI_SUCCESS;
    int comm_size, rank;
    int relative_rank, is_homogeneous;
    int mask, src, dst, relative_src;
    MPI_Aint recvtype_size, sendtype_size, curr_cnt = 0, nbytes;
    int recvblks;
    int tmp_buf_size, missing;
    void *tmp_buf = NULL;
    int blocks[2];
    int displs[2];
    MPI_Aint struct_displs[2];
    MPI_Aint extent = 0;
    int copy_offset = 0, copy_blks = 0;
    MPI_Datatype types[2], tmp_type;
    MPIR_SCHED_CHKPMEM_DECL(1);

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    if (((rank == root) && (recvcount == 0)) || ((rank != root) && (sendcount == 0)))
        goto fn_exit;

    is_homogeneous = TRUE;
#ifdef MPID_HAS_HETERO
    is_homogeneous = !comm_ptr->is_hetero;
#endif

    MPIR_Assert(comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM);

    /* Use binomial tree algorithm. */

    relative_rank = (rank >= root) ? rank - root : rank - root + comm_size;

    if (rank == root) {
        MPIR_Datatype_get_extent_macro(recvtype, extent);
        MPIR_Ensure_Aint_fits_in_pointer(MPIR_VOID_PTR_CAST_TO_MPI_AINT recvbuf +
                                         (extent * recvcount * comm_size));
    }

    if (is_homogeneous) {
        /* communicator is homogeneous. no need to pack buffer. */
        if (rank == root) {
            MPIR_Datatype_get_size_macro(recvtype, recvtype_size);
            nbytes = recvtype_size * recvcount;
        } else {
            MPIR_Datatype_get_size_macro(sendtype, sendtype_size);
            nbytes = sendtype_size * sendcount;
        }

        /* Find the number of missing nodes in my sub-tree compared to
         * a balanced tree */
        for (mask = 1; mask < comm_size; mask <<= 1);
        --mask;
        while (relative_rank & mask)
            mask >>= 1;
        missing = (relative_rank | mask) - comm_size + 1;
        if (missing < 0)
            missing = 0;
        tmp_buf_size = (mask - missing);

        /* If the message is smaller than the threshold, we will copy
         * our message in there too */
        if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)
            tmp_buf_size++;

        tmp_buf_size *= nbytes;

        /* For zero-ranked root, we don't need any temporary buffer */
        if ((rank == root) && (!root || (nbytes >= MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)))
            tmp_buf_size = 0;

        if (tmp_buf_size) {
            MPIR_SCHED_CHKPMEM_MALLOC(tmp_buf, void *, tmp_buf_size, mpi_errno, "tmp_buf",
                                      MPL_MEM_BUFFER);
        }

        if (rank == root) {
            if (sendbuf != MPI_IN_PLACE) {
                mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
                                           ((char *) recvbuf + extent * recvcount * rank),
                                           recvcount, recvtype);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
            }
        } else if (tmp_buf_size && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)) {
            /* copy from sendbuf into tmp_buf */
            mpi_errno = MPIR_Localcopy(sendbuf, sendcount, sendtype,
                                       tmp_buf, nbytes, MPI_BYTE);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }

        curr_cnt = nbytes;

        mask = 0x1;
        while (mask < comm_size) {
            if ((mask & relative_rank) == 0) {
                src = relative_rank | mask;
                if (src < comm_size) {
                    src = (src + root) % comm_size;

                    if (rank == root) {
                        recvblks = mask;
                        if ((2 * recvblks) > comm_size)
                            recvblks = comm_size - recvblks;

                        if ((rank + mask + recvblks == comm_size) ||
                            (((rank + mask) % comm_size) <
                             ((rank + mask + recvblks) % comm_size))) {
                            /* If the data contiguously fits into the
                             * receive buffer, place it directly. This
                             * should cover the case where the root is
                             * rank 0.
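                             *
                             * With root = 0, for instance, the incoming
                             * blocks always start at block index
                             * (rank + mask) and never wrap around the
                             * end of recvbuf, so this branch is always
                             * taken.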
                             */
                            char *rp = (char *) recvbuf +
                                (((rank + mask) % comm_size) * recvcount * extent);
                            mpi_errno = MPIR_Sched_recv(rp, (recvblks * recvcount), recvtype,
                                                        src, comm_ptr, s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                            mpi_errno = MPIR_Sched_barrier(s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                        } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) {
                            mpi_errno = MPIR_Sched_recv(tmp_buf, (recvblks * nbytes), MPI_BYTE,
                                                        src, comm_ptr, s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                            mpi_errno = MPIR_Sched_barrier(s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                            copy_offset = rank + mask;
                            copy_blks = recvblks;
                        } else {
                            blocks[0] = recvcount * (comm_size - root - mask);
                            displs[0] = recvcount * (root + mask);
                            blocks[1] = (recvcount * recvblks) - blocks[0];
                            displs[1] = 0;

                            mpi_errno = MPIR_Type_indexed_impl(2, blocks, displs, recvtype,
                                                               &tmp_type);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                            mpi_errno = MPIR_Type_commit_impl(&tmp_type);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);

                            mpi_errno = MPIR_Sched_recv(recvbuf, 1, tmp_type, src, comm_ptr, s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);
                            mpi_errno = MPIR_Sched_barrier(s);
                            if (mpi_errno)
                                MPIR_ERR_POP(mpi_errno);

                            /* this "premature" free is safe b/c the sched
                             * holds an actual ref to keep it alive */
                            MPIR_Type_free_impl(&tmp_type);
                        }
                    } else {
                        /* Intermediate nodes store in temporary buffer */
                        MPI_Aint offset;

                        /* Estimate the amount of data that is going to
                         * come in */
                        recvblks = mask;
                        relative_src = ((src - root) < 0) ? (src - root + comm_size)
                                                          : (src - root);
                        if (relative_src + mask > comm_size)
                            recvblks -= (relative_src + mask - comm_size);

                        if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE)
                            offset = mask * nbytes;
                        else
                            offset = (mask - 1) * nbytes;
                        mpi_errno = MPIR_Sched_recv(((char *) tmp_buf + offset),
                                                    (recvblks * nbytes), MPI_BYTE, src,
                                                    comm_ptr, s);
                        if (mpi_errno)
                            MPIR_ERR_POP(mpi_errno);
                        mpi_errno = MPIR_Sched_barrier(s);
                        if (mpi_errno)
                            MPIR_ERR_POP(mpi_errno);
                        curr_cnt += (recvblks * nbytes);
                    }
                }
            } else {
                dst = relative_rank ^ mask;
                dst = (dst + root) % comm_size;

                if (!tmp_buf_size) {
                    /* leaf nodes send directly from sendbuf */
                    mpi_errno = MPIR_Sched_send(sendbuf, sendcount, sendtype, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    mpi_errno = MPIR_Sched_barrier(s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                } else if (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) {
                    mpi_errno = MPIR_Sched_send(tmp_buf, curr_cnt, MPI_BYTE, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    mpi_errno = MPIR_Sched_barrier(s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                } else {
                    blocks[0] = sendcount;
                    struct_displs[0] = MPIR_VOID_PTR_CAST_TO_MPI_AINT sendbuf;
                    types[0] = sendtype;
                    /* check for overflow; work around int limits if needed */
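                    /* The (int) cast below round-trips curr_cnt - nbytes
                     * through int: if the value survives unchanged, it fits
                     * in an int and can be used as a plain count; otherwise
                     * a large-count contiguous type is built so the count
                     * passed to the send stays at 1. */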
                    if (curr_cnt - nbytes != (int) (curr_cnt - nbytes)) {
                        blocks[1] = 1;
                        MPIR_Type_contiguous_x_impl(curr_cnt - nbytes, MPI_BYTE, &(types[1]));
                    } else {
                        MPIR_Assign_trunc(blocks[1], curr_cnt - nbytes, int);
                        types[1] = MPI_BYTE;
                    }
                    struct_displs[1] = MPIR_VOID_PTR_CAST_TO_MPI_AINT tmp_buf;

                    mpi_errno = MPIR_Type_create_struct_impl(2, blocks, struct_displs, types,
                                                             &tmp_type);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    mpi_errno = MPIR_Type_commit_impl(&tmp_type);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);

                    mpi_errno = MPIR_Sched_send(MPI_BOTTOM, 1, tmp_type, dst, comm_ptr, s);
                    if (mpi_errno)
                        MPIR_ERR_POP(mpi_errno);
                    MPIR_SCHED_BARRIER(s);

                    /* this "premature" free is safe b/c the sched holds an
                     * actual ref to keep it alive */
                    MPIR_Type_free_impl(&tmp_type);
                }
                break;
            }
            mask <<= 1;
        }

        if ((rank == root) && root && (nbytes < MPIR_CVAR_GATHER_VSMALL_MSG_SIZE) && copy_blks) {
            /* reorder and copy from tmp_buf into recvbuf */
            /* FIXME why are there two copies here? */
            mpi_errno = MPIR_Sched_copy(tmp_buf, nbytes * (comm_size - copy_offset), MPI_BYTE,
                                        ((char *) recvbuf + extent * recvcount * copy_offset),
                                        recvcount * (comm_size - copy_offset), recvtype, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            mpi_errno = MPIR_Sched_copy((char *) tmp_buf + nbytes * (comm_size - copy_offset),
                                        nbytes * (copy_blks - comm_size + copy_offset), MPI_BYTE,
                                        recvbuf,
                                        recvcount * (copy_blks - comm_size + copy_offset),
                                        recvtype, s);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
    }
#ifdef MPID_HAS_HETERO
    else {