/*@

MPI_Reduce_scatter - Combines values and scatters the results

Input Parameters:
+ sendbuf - starting address of send buffer (choice)
. recvcounts - integer array specifying the number of elements in result
  distributed to each process.  Array must be identical on all calling
  processes.
. datatype - data type of elements of input buffer (handle)
. op - operation (handle)
- comm - communicator (handle)

Output Parameter:
. recvbuf - starting address of receive buffer (choice)

.N fortran
.N collops

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
.N MPI_ERR_OP
.N MPI_ERR_BUFFER_ALIAS
@*/
EXPORT_MPI_API int MPI_Reduce_scatter ( void *sendbuf, void *recvbuf,
                                        int *recvcnts, MPI_Datatype datatype,
                                        MPI_Op op, MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr;
    struct MPIR_DATATYPE     *dtype_ptr;
    static char myname[] = "MPI_REDUCE_SCATTER";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);
    comm_ptr  = MPIR_GET_COMM_PTR(comm);
    dtype_ptr = MPIR_GET_DTYPE_PTR(datatype);

    /* Check for invalid arguments */
#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
    MPIR_TEST_DTYPE(datatype,dtype_ptr,comm_ptr,myname);
    MPIR_TEST_ALIAS(recvbuf,sendbuf);
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = comm_ptr->collops->Reduce_scatter(sendbuf, recvbuf, recvcnts,
                                                  dtype_ptr, op, comm_ptr );
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
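/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * every rank contributes one value per process, each rank receives the sum
 * of the values destined for it.  Variable names are chosen for the example
 * only.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int size, rank, i, result;
    int *sendvals, *recvcnts;

    MPI_Init( &argc, &argv );
    MPI_Comm_size( MPI_COMM_WORLD, &size );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    sendvals = (int *)malloc( size * sizeof(int) );
    recvcnts = (int *)malloc( size * sizeof(int) );
    for (i = 0; i < size; i++) {
        sendvals[i] = rank + i;   /* value this rank contributes for rank i */
        recvcnts[i] = 1;          /* each rank receives exactly one element */
    }

    MPI_Reduce_scatter( sendvals, &result, recvcnts, MPI_INT, MPI_SUM,
                        MPI_COMM_WORLD );
    printf( "[%d] reduced value = %d\n", rank, result );

    free( sendvals ); free( recvcnts );
    MPI_Finalize();
    return 0;
}
#endif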
/*@

MPI_Sendrecv - Sends and receives a message

Input Parameters:
+ sendbuf - initial address of send buffer (choice)
. sendcount - number of elements in send buffer (integer)
. sendtype - type of elements in send buffer (handle)
. dest - rank of destination (integer)
. sendtag - send tag (integer)
. recvcount - number of elements in receive buffer (integer)
. recvtype - type of elements in receive buffer (handle)
. source - rank of source (integer)
. recvtag - receive tag (integer)
- comm - communicator (handle)

Output Parameters:
+ recvbuf - initial address of receive buffer (choice)
- status - status object (Status).  This refers to the receive operation.

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_TAG
.N MPI_ERR_RANK
@*/
EXPORT_MPI_API int MPI_Sendrecv( void *sendbuf, int sendcount,
                                 MPI_Datatype sendtype, int dest, int sendtag,
                                 void *recvbuf, int recvcount,
                                 MPI_Datatype recvtype, int source,
                                 int recvtag, MPI_Comm comm,
                                 MPI_Status *status )
{
    MPI_Status __status;
    int mpi_errno = MPI_SUCCESS;
    MPI_Status status_array[2];
    MPI_Request req[2];
    MPIR_ERROR_DECL;
    struct MPIR_COMMUNICATOR *comm_ptr;
    static char myname[] = "MPI_SENDRECV";

    if (status == MPI_STATUS_IGNORE) status = &__status;

    /* Let the Isend/Irecv check arguments */
    /* Comments on this:
       We can probably do an Irecv/Send/Wait on Irecv (blocking send), but
       what we would really like to do is "send if odd, recv if even,
       followed by send if even, recv if odd".  We can't do that, because
       we don't require that these match up in any particular way (that is,
       there is no way to assert the "parity" of the partners).  Note that
       the IBM "mp_bsendrecv" DOES require that only mp_bsendrecv be used.
       Should there be a send/recv bit in the send mode?

       Note that in this implementation, if the error handler is "return",
       these will return the error to the caller.  If the handler causes an
       abort or message, then that will occur in the called routine.  Thus,
       this code need not call the error handler AGAIN.
     */
    comm_ptr = MPIR_GET_COMM_PTR(comm);
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);

    MPIR_ERROR_PUSH(comm_ptr);
    MPIR_CALL_POP(MPI_Irecv ( recvbuf, recvcount, recvtype, source, recvtag,
                              comm, &req[1] ),comm_ptr,myname);
    MPIR_CALL_POP(MPI_Isend ( sendbuf, sendcount, sendtype, dest, sendtag,
                              comm, &req[0] ),comm_ptr,myname);

    /* FPRINTF( stderr, "[%d] Starting waitall\n", MPIR_tid );*/
    mpi_errno = MPI_Waitall( 2, req, status_array );
    /* We don't use MPIR_CALL_POP because we want to convert error in
       status to the direct error */
    /* MPIR_CALL_POP(MPI_Waitall ( 2, req, status_array ),comm_ptr,myname); */
    MPIR_ERROR_POP(comm_ptr);
    /*FPRINTF( stderr, "[%d] Ending waitall\n", MPIR_tid );*/

    if (mpi_errno == MPI_ERR_IN_STATUS) {
        if (status_array[0].MPI_ERROR) mpi_errno = status_array[0].MPI_ERROR;
        if (status_array[1].MPI_ERROR) mpi_errno = status_array[1].MPI_ERROR;
    }
    (*status) = status_array[1];
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
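/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * a ring shift in which every rank sends to rank+1 and receives from rank-1
 * with a single MPI_Sendrecv, which cannot deadlock the way two blocking
 * MPI_Send/MPI_Recv calls might.
 */
#if 0
#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, size, right, left, sendval, recvval;
    MPI_Status status;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    right   = (rank + 1) % size;
    left    = (rank + size - 1) % size;
    sendval = rank;

    MPI_Sendrecv( &sendval, 1, MPI_INT, right, 0,
                  &recvval, 1, MPI_INT, left,  0,
                  MPI_COMM_WORLD, &status );
    printf( "[%d] received %d from rank %d\n", rank, recvval, left );

    MPI_Finalize();
    return 0;
}
#endif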
/*@

MPI_Type_indexed - Creates an indexed datatype

Input Parameters:
+ count - number of blocks -- also number of entries in indices and blocklens
. blocklens - number of elements in each block (array of nonnegative integers)
. indices - displacement of each block in multiples of old_type (array of
  integers)
- old_type - old datatype (handle)

Output Parameter:
. newtype - new datatype (handle)

.N fortran

The indices are displacements, and are based on a zero origin.  A common
error is to do something like the following
.vb
    integer a(100)
    integer blens(10), indices(10)
    do 10 i=1,10
         blens(i)   = 1
10       indices(i) = 1 + (i-1)*10
    call MPI_TYPE_INDEXED(10,blens,indices,MPI_INTEGER,newtype,ierr)
    call MPI_TYPE_COMMIT(newtype,ierr)
    call MPI_SEND(a,1,newtype,...)
.ve
expecting this to send 'a(1),a(11),...' because the indices have values
'1,11,...'.  Because these are `displacements` from the beginning of 'a',
it actually sends 'a(1+1),a(1+11),...'.

If you wish to consider the displacements as indices into a Fortran array,
consider declaring the Fortran array with a zero origin
.vb
    integer a(0:99)
.ve

.N Errors
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_ARG
.N MPI_ERR_EXHAUSTED
@*/
EXPORT_MPI_API int MPI_Type_indexed( int count, int blocklens[],
                                     int indices[], MPI_Datatype old_type,
                                     MPI_Datatype *newtype )
{
    MPI_Aint *hindices;
    int i, mpi_errno = MPI_SUCCESS;
    int total_count;
    struct MPIR_DATATYPE *old_dtype_ptr;
    static char myname[] = "MPI_TYPE_INDEXED";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);

    /* Check for bad arguments */
    old_dtype_ptr = MPIR_GET_DTYPE_PTR(old_type);
    MPIR_TEST_DTYPE(old_type,old_dtype_ptr,MPIR_COMM_WORLD,myname);
    if ( ( (count < 0)                            && (mpi_errno = MPI_ERR_COUNT) ) ||
         ( (old_dtype_ptr->dte_type == MPIR_UB)   && (mpi_errno = MPI_ERR_TYPE) )  ||
         ( (old_dtype_ptr->dte_type == MPIR_LB)   && (mpi_errno = MPI_ERR_TYPE) ) )
        return MPIR_ERROR( MPIR_COMM_WORLD, mpi_errno,myname);

    /* Are we making a null datatype? */
    total_count = 0;
    for (i=0; i<count; i++) {
        total_count += blocklens[i];
        if (blocklens[i] < 0) {
            mpi_errno = MPIR_Err_setmsg( MPI_ERR_ARG, MPIR_ERR_ARG_ARRAY_VAL,
                                         myname, (char *)0, (char *)0,
                                         "blocklens", i, blocklens[i] );
            return MPIR_ERROR( MPIR_COMM_WORLD, mpi_errno,myname);
        }
    }
    if (total_count == 0) {
        return MPI_Type_contiguous( 0, MPI_INT, newtype );
    }

    /* Generate a call to MPI_Type_hindexed instead.  This means allocating
       a temporary displacement array, multiplying all displacements by
       extent(old_type), and using that */
    MPIR_ALLOC(hindices,(MPI_Aint *)MALLOC(count*sizeof(MPI_Aint)),
               MPIR_COMM_WORLD,MPI_ERR_EXHAUSTED,myname);
    for (i=0; i<count; i++) {
        hindices[i] = (MPI_Aint)indices[i] * old_dtype_ptr->extent;
    }
    MPIR_ERROR_PUSH(MPIR_COMM_WORLD);
    mpi_errno = MPI_Type_hindexed( count, blocklens, hindices, old_type,
                                   newtype );
    MPIR_ERROR_POP(MPIR_COMM_WORLD);
    FREE(hindices);
    TR_POP;
    MPIR_RETURN(MPIR_COMM_WORLD,mpi_errno, myname);
}
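/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * the C analogue of the Fortran discussion above.  Displacements are zero
 * based, so to send a[0], a[10], ... the displacement array holds 0, 10, ...
 * rather than 1, 11, ....  The helper name is chosen for the example only.
 */
#if 0
#include "mpi.h"

void send_every_tenth( int a[100], int dest, MPI_Comm comm )
{
    MPI_Datatype newtype;
    int blens[10], indices[10], i;

    for (i = 0; i < 10; i++) {
        blens[i]   = 1;
        indices[i] = i * 10;      /* displacement from &a[0], in ints */
    }
    MPI_Type_indexed( 10, blens, indices, MPI_INT, &newtype );
    MPI_Type_commit( &newtype );
    MPI_Send( a, 1, newtype, dest, 0, comm );
    MPI_Type_free( &newtype );
}
#endif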
/*@

MPI_Allreduce - Combines values from all processes and distributes the
                result back to all processes

Input Parameters:
+ sendbuf - starting address of send buffer (choice)
. count - number of elements in send buffer (integer)
. datatype - data type of elements of send buffer (handle)
. op - operation (handle)
- comm - communicator (handle)

Output Parameter:
. recvbuf - starting address of receive buffer (choice)

.N fortran
.N collops

.N Errors
.N MPI_ERR_BUFFER
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_OP
.N MPI_ERR_COMM
@*/
EXPORT_MPI_API int MPI_Allreduce ( void *sendbuf, void *recvbuf, int count,
                                   MPI_Datatype datatype, MPI_Op op,
                                   MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr;
    struct MPIR_DATATYPE     *dtype_ptr;
    MPIR_ERROR_DECL;
    static char myname[] = "MPI_ALLREDUCE";

#ifdef RED_DEBUG
    char zahl[10];
    static int callcount=0;
    callcount++;
    DBG("Entering Allreduce()");
    DBG(itoa(callcount,zahl,10));
#endif

    TR_PUSH(myname);
    comm_ptr  = MPIR_GET_COMM_PTR(comm);
    dtype_ptr = MPIR_GET_DTYPE_PTR(datatype);

#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
    MPIR_TEST_DTYPE(datatype,dtype_ptr,comm_ptr,myname);
    MPIR_TEST_COUNT(count);
    MPIR_TEST_ALIAS(sendbuf,recvbuf);
    if (mpi_errno) {
#ifdef RED_DEBUG
        DBG("Leaving Allreduce Error No");
        DBG(itoa(mpi_errno,zahl,10));
#endif
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
    }
#endif

    MPIR_ERROR_PUSH(comm_ptr);
    /* Test for intercommunicator is done when collops is assigned */
    mpi_errno = comm_ptr->collops->Allreduce(sendbuf, recvbuf, count,
                                             dtype_ptr, op, comm_ptr );
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
#ifdef RED_DEBUG
    DBG("Leaving Allreduce");
#endif
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
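/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * every rank contributes a partial sum and all ranks receive the global
 * total.
 */
#if 0
#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, localval, globalsum;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    localval = rank + 1;          /* each rank's contribution */
    MPI_Allreduce( &localval, &globalsum, 1, MPI_INT, MPI_SUM,
                   MPI_COMM_WORLD );
    printf( "[%d] global sum = %d\n", rank, globalsum );

    MPI_Finalize();
    return 0;
}
#endif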
/*@

MPI_Graph_get - Retrieves graph topology information associated with a
                communicator

Input Parameters:
+ comm - communicator with graph structure (handle)
. maxindex - length of vector 'index' in the calling program (integer)
- maxedges - length of vector 'edges' in the calling program (integer)

Output Parameters:
+ index - array of integers containing the graph structure (for details see
  the definition of 'MPI_GRAPH_CREATE')
- edges - array of integers containing the graph structure

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TOPOLOGY
.N MPI_ERR_COMM
.N MPI_ERR_ARG
@*/
int MPI_Graph_get ( MPI_Comm comm, int maxindex, int maxedges,
                    int *index, int *edges )
{
    int i, num, flag;
    int *array;
    int mpi_errno = MPI_SUCCESS;
    MPIR_TOPOLOGY *topo;
    struct MPIR_COMMUNICATOR *comm_ptr;
    static char myname[] = "MPI_GRAPH_GET";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);
    comm_ptr = MPIR_GET_COMM_PTR(comm);

#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
    MPIR_TEST_ARG(index);
    MPIR_TEST_ARG(edges);
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    /* Get topology information from the communicator */
    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = MPI_Attr_get ( comm, MPIR_TOPOLOGY_KEYVAL, (void **)&topo,
                               &flag );
    MPIR_ERROR_POP(comm_ptr);
    if ( ( (flag != 1)               && (mpi_errno = MPI_ERR_TOPOLOGY) ) ||
         ( (topo->type != MPI_GRAPH) && (mpi_errno = MPI_ERR_TOPOLOGY) ) )
        return MPIR_ERROR( comm_ptr, mpi_errno, myname );

    /* Get index */
    num = topo->graph.nnodes;
    array = topo->graph.index;
    if ( index != (int *)0 )
        for ( i=0; (i<maxindex) && (i<num); i++ )
            (*index++) = (*array++);

    /* Get edges */
    num = topo->graph.nedges;
    array = topo->graph.edges;
    if ( edges != (int *)0 )
        for ( i=0; (i<maxedges) && (i<num); i++ )
            (*edges++) = (*array++);

    TR_POP;
    return (mpi_errno);
}
/*@

MPI_Graphdims_get - Retrieves graph topology information associated with a
                    communicator

Input Parameters:
. comm - communicator for group with graph structure (handle)

Output Parameters:
+ nnodes - number of nodes in graph (integer)
- nedges - number of edges in graph (integer)

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_TOPOLOGY
.N MPI_ERR_COMM
.N MPI_ERR_ARG
@*/
EXPORT_MPI_API int MPI_Graphdims_get ( MPI_Comm comm, int *nnodes,
                                       int *nedges )
{
    int mpi_errno = MPI_SUCCESS, flag;
    MPIR_TOPOLOGY *topo;
    struct MPIR_COMMUNICATOR *comm_ptr;
    static char myname[] = "MPI_GRAPHDIMS_GET";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);
    comm_ptr = MPIR_GET_COMM_PTR(comm);

#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
    MPIR_TEST_ARG(nnodes);
    MPIR_TEST_ARG(nedges);
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    /* Get topology information from the communicator */
    MPIR_ERROR_PUSH( comm_ptr );
    mpi_errno = MPI_Attr_get ( comm, MPIR_TOPOLOGY_KEYVAL, (void **)&topo,
                               &flag );
    MPIR_ERROR_POP( comm_ptr );
    if (mpi_errno) {
        return MPIR_ERROR( comm_ptr, mpi_errno, myname );
    }

    /* Set nnodes */
    if ( nnodes != (int *)0 )
        if ( (flag == 1) && (topo->type == MPI_GRAPH) )
            (*nnodes) = topo->graph.nnodes;
        else
            (*nnodes) = MPI_UNDEFINED;

    /* Set nedges */
    if ( nedges != (int *)0 )
        if ( (flag == 1) && (topo->type == MPI_GRAPH) )
            (*nedges) = topo->graph.nedges;
        else
            (*nedges) = MPI_UNDEFINED;

    TR_POP;
    return (MPI_SUCCESS);
}
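/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * querying a graph topology previously attached with MPI_Graph_create.
 * MPI_Graphdims_get reports how large the index and edges arrays must be,
 * and MPI_Graph_get then retrieves them.  The helper name is an example only.
 */
#if 0
#include <stdlib.h>
#include "mpi.h"

void query_graph( MPI_Comm graph_comm )
{
    int nnodes, nedges;
    int *index, *edges;

    MPI_Graphdims_get( graph_comm, &nnodes, &nedges );
    index = (int *)malloc( nnodes * sizeof(int) );
    edges = (int *)malloc( nedges * sizeof(int) );
    MPI_Graph_get( graph_comm, nnodes, nedges, index, edges );
    /* ... use index/edges ... */
    free( index ); free( edges );
}
#endif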
/*@

MPI_Gatherv - Gathers into specified locations from all processes in a group

Input Parameters:
+ sendbuf - starting address of send buffer (choice)
. sendcount - number of elements in send buffer (integer)
. sendtype - data type of send buffer elements (handle)
. recvcounts - integer array (of length group size) containing the number of
  elements that are received from each process (significant only at 'root')
. displs - integer array (of length group size).  Entry 'i' specifies the
  displacement relative to recvbuf at which to place the incoming data from
  process 'i' (significant only at root)
. recvtype - data type of recv buffer elements (significant only at 'root')
  (handle)
. root - rank of receiving process (integer)
- comm - communicator (handle)

Output Parameter:
. recvbuf - address of receive buffer (choice, significant only at 'root')

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
@*/
int MPI_Gatherv ( void *sendbuf, int sendcnt, MPI_Datatype sendtype,
                  void *recvbuf, int *recvcnts, int *displs,
                  MPI_Datatype recvtype, int root, MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    struct MPIR_COMMUNICATOR *comm_ptr;
    struct MPIR_DATATYPE *stype_ptr, *rtype_ptr = 0;
    MPIR_ERROR_DECL;
    static char myname[] = "MPI_GATHERV";

    TR_PUSH(myname);
    comm_ptr  = MPIR_GET_COMM_PTR(comm);
    stype_ptr = MPIR_GET_DTYPE_PTR(sendtype);

#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr, myname);
    MPIR_TEST_COUNT(sendcnt);
    MPIR_TEST_DTYPE(sendtype,stype_ptr,comm_ptr, myname );
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    /* rtype is significant only at root */
    (void) MPIR_Comm_rank ( comm_ptr, &rank );
    if (rank == root) {
        rtype_ptr = MPIR_GET_DTYPE_PTR(recvtype);
#ifndef MPIR_NO_ERROR_CHECKING
        MPIR_TEST_DTYPE(recvtype,rtype_ptr,comm_ptr, myname );
        if (mpi_errno)
            return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif
    }

    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = comm_ptr->collops->Gatherv( sendbuf, sendcnt, stype_ptr,
                                            recvbuf, recvcnts, displs,
                                            rtype_ptr, root, comm_ptr );
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
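/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * rank i contributes i+1 integers and the root gathers them back to back,
 * so the recvcounts and displs arrays describe a ragged layout.
 */
#if 0
#include <stdlib.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, size, i, total = 0;
    int *sendvals, *recvcnts = 0, *displs = 0, *recvbuf = 0;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    sendvals = (int *)malloc( (rank + 1) * sizeof(int) );
    for (i = 0; i <= rank; i++) sendvals[i] = rank;

    if (rank == 0) {
        recvcnts = (int *)malloc( size * sizeof(int) );
        displs   = (int *)malloc( size * sizeof(int) );
        for (i = 0; i < size; i++) {
            recvcnts[i] = i + 1;     /* rank i sends i+1 elements   */
            displs[i]   = total;     /* placed right after the rest */
            total      += recvcnts[i];
        }
        recvbuf = (int *)malloc( total * sizeof(int) );
    }

    MPI_Gatherv( sendvals, rank + 1, MPI_INT,
                 recvbuf, recvcnts, displs, MPI_INT, 0, MPI_COMM_WORLD );

    free( sendvals );
    if (rank == 0) { free( recvcnts ); free( displs ); free( recvbuf ); }
    MPI_Finalize();
    return 0;
}
#endif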
/*@

MPI_Wait - Waits for an MPI send or receive to complete

Input Parameter:
. request - request (handle)

Output Parameter:
. status - status object (Status)

.N waitstatus

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_REQUEST
.N MPI_ERR_ARG
@*/
EXPORT_MPI_API int MPI_Wait ( MPI_Request *request, MPI_Status *status )
{
    MPI_Status __status;
    int mpi_errno;
    MPIR_ERROR_DECL;

    MPIR_ERROR_PUSH(MPIR_COMM_WORLD);

    if (status == MPI_STATUS_IGNORE) status = &__status;

    /* We'll let MPI_Waitall catch the errors */
    mpi_errno = MPI_Waitall( 1, request, status );
    MPIR_ERROR_POP(MPIR_COMM_WORLD);
    if (mpi_errno == MPI_ERR_IN_STATUS)
        mpi_errno = status->MPI_ERROR;
    MPIR_RETURN(MPIR_COMM_WORLD,mpi_errno,"MPI_WAIT");
}
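/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * starting a nonblocking receive, doing other work, then blocking in
 * MPI_Wait until the message has arrived.  The helper name is an example
 * only.
 */
#if 0
#include "mpi.h"

void recv_when_ready( int *buf, int count, int source, MPI_Comm comm )
{
    MPI_Request request;
    MPI_Status  status;

    MPI_Irecv( buf, count, MPI_INT, source, 0, comm, &request );
    /* ... overlap computation with the incoming message ... */
    MPI_Wait( &request, &status );
}
#endif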
/*@

MPI_Bcast - Broadcasts a message from the process with rank "root" to
            all other processes of the group.

Input/output Parameters:
+ buffer - starting address of buffer (choice)
. count - number of entries in buffer (integer)
. datatype - data type of buffer (handle)
. root - rank of broadcast root (integer)
- comm - communicator (handle)

Algorithm:
If the underlying device does not take responsibility, this function uses a
tree-like algorithm to broadcast the message to blocks of processes.  A
linear algorithm is then used to broadcast the message from the first process
in a block to all other processes.  'MPIR_BCAST_BLOCK_SIZE' determines the
size of blocks.  If this is set to 1, then this function is equivalent to
using a pure tree algorithm.  If it is set to the size of the group or
greater, it is a pure linear algorithm.  The value should be adjusted to
determine the most efficient value on different machines.

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
.N MPI_ERR_ROOT
@*/
EXPORT_MPI_API int MPI_Bcast ( void *buffer, int count, MPI_Datatype datatype,
                               int root, MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr;
    struct MPIR_DATATYPE     *dtype_ptr;
    static char myname[] = "MPI_BCAST";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);

    comm_ptr  = MPIR_GET_COMM_PTR(comm);
    dtype_ptr = MPIR_GET_DTYPE_PTR(datatype);

    /* Check for invalid arguments */
#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
    MPIR_TEST_DTYPE(datatype,dtype_ptr,comm_ptr,myname);
    /* If an intercomm broadcast, the root can also be MPI_ROOT or
       MPI_PROC_NULL */
    if (root < MPI_ROOT || root >= comm_ptr->np) {
        mpi_errno = MPIR_Err_setmsg( MPI_ERR_ROOT, MPIR_ERR_DEFAULT, myname,
                                     (char *)0, (char *)0, root );
    }
    MPIR_TEST_COUNT(count);
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0) return MPI_SUCCESS;

    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = comm_ptr->collops->Bcast(buffer, count, dtype_ptr, root,
                                         comm_ptr);
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
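/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * rank 0 obtains a parameter and broadcasts it so that every rank holds the
 * same value.
 */
#if 0
#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, nsteps = 0;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    if (rank == 0) nsteps = 100;          /* e.g. read from input on root */
    MPI_Bcast( &nsteps, 1, MPI_INT, 0, MPI_COMM_WORLD );
    printf( "[%d] nsteps = %d\n", rank, nsteps );

    MPI_Finalize();
    return 0;
}
#endif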
/*@

MPI_Scatter - Sends data from one task to all other tasks in a group

Input Parameters:
+ sendbuf - address of send buffer (choice, significant only at 'root')
. sendcount - number of elements sent to each process (integer, significant
  only at 'root')
. sendtype - data type of send buffer elements (significant only at 'root')
  (handle)
. recvcount - number of elements in receive buffer (integer)
. recvtype - data type of receive buffer elements (handle)
. root - rank of sending process (integer)
- comm - communicator (handle)

Output Parameter:
. recvbuf - address of receive buffer (choice)

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
@*/
int MPI_Scatter ( void *sendbuf, int sendcnt, MPI_Datatype sendtype,
                  void *recvbuf, int recvcnt, MPI_Datatype recvtype,
                  int root, MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr;
    struct MPIR_DATATYPE *stype_ptr = NULL;
    struct MPIR_DATATYPE *rtype_ptr = NULL;
    static char myname[] = "MPI_SCATTER";
    MPIR_ERROR_DECL;

    TR_PUSH(myname);
    comm_ptr = MPIR_GET_COMM_PTR(comm);
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);

    /* Significant only at root */
    if (root == comm_ptr->local_rank) {
        stype_ptr = MPIR_GET_DTYPE_PTR(sendtype);
        MPIR_TEST_DTYPE(sendtype,stype_ptr,comm_ptr,myname);
    }

    rtype_ptr = MPIR_GET_DTYPE_PTR(recvtype);
    MPIR_TEST_DTYPE(recvtype,rtype_ptr,comm_ptr,myname);

    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = comm_ptr->collops->Scatter(sendbuf, sendcnt, stype_ptr,
                                           recvbuf, recvcnt, rtype_ptr,
                                           root, comm_ptr );
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
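/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * the root distributes one integer to each rank from a contiguous send
 * buffer.
 */
#if 0
#include <stdio.h>
#include <stdlib.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, size, i, myval;
    int *sendvals = 0;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    if (rank == 0) {
        sendvals = (int *)malloc( size * sizeof(int) );
        for (i = 0; i < size; i++) sendvals[i] = 10 * i;
    }

    MPI_Scatter( sendvals, 1, MPI_INT, &myval, 1, MPI_INT, 0,
                 MPI_COMM_WORLD );
    printf( "[%d] received %d\n", rank, myval );

    if (rank == 0) free( sendvals );
    MPI_Finalize();
    return 0;
}
#endif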
/*@

MPI_Allgather - Gathers data from all tasks and distributes it to all

Input Parameters:
+ sendbuf - starting address of send buffer (choice)
. sendcount - number of elements in send buffer (integer)
. sendtype - data type of send buffer elements (handle)
. recvcount - number of elements received from any process (integer)
. recvtype - data type of receive buffer elements (handle)
- comm - communicator (handle)

Output Parameter:
. recvbuf - address of receive buffer (choice)

Notes:
The MPI standard (1.0 and 1.1) says that

 The jth block of data sent from each process is received by every process
 and placed in the jth block of the buffer 'recvbuf'.

This is misleading; a better description is

 The block of data sent from the jth process is received by every process
 and placed in the jth block of the buffer 'recvbuf'.

This text was suggested by Rajeev Thakur.

.N fortran

.N Errors
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_BUFFER
@*/
int MPI_Allgather ( void *sendbuf, int sendcount, MPI_Datatype sendtype,
                    void *recvbuf, int recvcount, MPI_Datatype recvtype,
                    MPI_Comm comm )
{
    int mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr=0;
    struct MPIR_DATATYPE *stype_ptr=0, *rtype_ptr=0;
    MPIR_ERROR_DECL;
    static char myname[] = "MPI_ALLGATHER";

    TR_PUSH(myname);
    comm_ptr  = MPIR_GET_COMM_PTR(comm);
    stype_ptr = MPIR_GET_DTYPE_PTR(sendtype);
    rtype_ptr = MPIR_GET_DTYPE_PTR(recvtype);

    /* Check for invalid arguments */
#ifndef MPIR_NO_ERROR_CHECKING
    MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr, myname);
    MPIR_TEST_DTYPE(sendtype,stype_ptr,comm_ptr, myname );
    MPIR_TEST_DTYPE(recvtype,rtype_ptr,comm_ptr, myname );
    MPIR_TEST_COUNT(sendcount);
    MPIR_TEST_COUNT(recvcount);
    if (mpi_errno)
        return MPIR_ERROR(comm_ptr, mpi_errno, myname );
#endif

    MPIR_ERROR_PUSH(comm_ptr);
    mpi_errno = comm_ptr->collops->Allgather( sendbuf, sendcount, stype_ptr,
                                              recvbuf, recvcount, rtype_ptr,
                                              comm_ptr );
    MPIR_ERROR_POP(comm_ptr);
    TR_POP;
    MPIR_RETURN(comm_ptr,mpi_errno,myname);
}
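/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * every rank contributes one value and every rank ends up with the full
 * vector of contributions, ordered by rank.
 */
#if 0
#include <stdlib.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, size, myval;
    int *allvals;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
    MPI_Comm_size( MPI_COMM_WORLD, &size );

    myval   = rank * rank;
    allvals = (int *)malloc( size * sizeof(int) );

    MPI_Allgather( &myval, 1, MPI_INT, allvals, 1, MPI_INT, MPI_COMM_WORLD );
    /* allvals[j] now holds the value contributed by rank j, on every rank */

    free( allvals );
    MPI_Finalize();
    return 0;
}
#endif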
static int intra_Scan(void *sendbuf, void *recvbuf, int count,
                      struct MPIR_DATATYPE *datatype, MPI_Op op,
                      struct MPIR_COMMUNICATOR *comm)
{
    MPI_Status status;
    int rank, size;
    int mpi_errno = MPI_SUCCESS;
    MPI_Aint lb, ub, m_extent;      /* Extent in memory */
    MPI_User_function *uop;
    struct MPIR_OP *op_ptr;
    MPIR_ERROR_DECL;
    mpi_comm_err_ret = 0;

    /* See the overview in Collection Operations for why this is ok */
    if (count == 0) return MPI_SUCCESS;

    /* Get my rank & size and switch communicators to the hidden collective */
    MPIR_Comm_size(comm, &size);
    MPIR_Comm_rank(comm, &rank);
    MPIR_Type_get_limits(datatype, &lb, &ub);
    m_extent = ub - lb;
    comm = comm->comm_coll;
    op_ptr = MPIR_GET_OP_PTR(op);
    MPIR_TEST_MPI_OP(op, op_ptr, comm, "MPI_SCAN");
    uop = op_ptr->op;

    /* Lock for collective operation */
    MPID_THREAD_LOCK(comm->ADIctx, comm);

    /* This is an O(size) algorithm.  A modification of the algorithm in
       reduce.c can be used to make this O(log(size)) */

    /* commutative case requires no extra buffering */
    MPIR_Op_errno = MPI_SUCCESS;
    if (op_ptr->commute) {
        /* Do the scan operation */
        if (rank > 0) {
            mpi_errno = MPI_Recv(recvbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno) return mpi_errno;
            /* See reduce for why pop/push */
            MPIR_ERROR_POP(comm);
            (*uop) (sendbuf, recvbuf, &count, &datatype->self);
            MPIR_ERROR_PUSH(comm);
        }
        else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno) return mpi_errno;
        }
    }
    /* non-commutative case requires extra buffering */
    else {
        /* Do the scan operation */
        if (rank > 0) {
            void *tmpbuf;
            MPIR_ALLOC(tmpbuf, (void *) MALLOC(m_extent * count),
                       comm, MPI_ERR_EXHAUSTED, "MPI_SCAN");
            tmpbuf = (void *) ((char *) tmpbuf - lb);

            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno) return mpi_errno;

            mpi_errno = MPI_Recv(tmpbuf, count, datatype->self, rank - 1,
                                 MPIR_SCAN_TAG, comm->self, &status);
            if (mpi_errno) return mpi_errno;

            (*uop) (tmpbuf, recvbuf, &count, &datatype->self);
            FREE((char *) tmpbuf + lb);
        }
        else {
            MPIR_COPYSELF(sendbuf, count, datatype->self, recvbuf,
                          MPIR_SCAN_TAG, rank, comm->self);
            if (mpi_errno) return mpi_errno;
        }
    }

    /* send the letter to destination */
    if (rank < (size - 1))
        mpi_errno = MPI_Send(recvbuf, count, datatype->self, rank + 1,
                             MPIR_SCAN_TAG, comm->self);

    /* If the predefined operation detected an error, report it here */
    if (mpi_errno == MPI_SUCCESS && MPIR_Op_errno)
        mpi_errno = MPIR_Op_errno;

    /* Unlock for collective operation */
    MPID_THREAD_UNLOCK(comm->ADIctx, comm);

    return (mpi_errno);
}
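/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * the user-level MPI_Scan call that the routine above implements.  Each rank
 * receives the inclusive prefix sum of the contributions of ranks 0..rank.
 */
#if 0
#include <stdio.h>
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, localval, prefix;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    localval = rank + 1;
    MPI_Scan( &localval, &prefix, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD );
    printf( "[%d] prefix sum = %d\n", rank, prefix );

    MPI_Finalize();
    return 0;
}
#endif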
/*@

MPI_Intercomm_create - Creates an intercommunicator from two intracommunicators

Input Parameters:
+ local_comm - Local (intra)communicator
. local_leader - Rank in local_comm of leader (often 0)
. peer_comm - Remote communicator
. remote_leader - Rank in peer_comm of remote leader (often 0)
- tag - Message tag to use in constructing intercommunicator; if multiple
  'MPI_Intercomm_creates' are being made, they should use different tags (more
  precisely, ensure that the local and remote leaders are using different
  tags for each 'MPI_Intercomm_create').

Output Parameter:
. comm_out - Created intercommunicator

Notes:
The MPI 1.1 Standard contains two mutually exclusive comments on the
input intracommunicators.  One says that their respective groups must be
disjoint; the other that the leaders can be the same process.  After
some discussion by the MPI Forum, it has been decided that the groups must
be disjoint.  Note that the `reason` given for this in the standard is
`not` the reason for this choice; rather, the `other` operations on
intercommunicators (like 'MPI_Intercomm_merge') do not make sense if the
groups are not disjoint.

.N fortran

Algorithm:
+ 1) Allocate a send context, an inter-coll context, and an intra-coll
     context
. 2) Send "send_context" and lrank_to_grank list from local comm group
     if I''m the local_leader.
. 3) If I''m the local leader, then wait on the posted sends and receives
     to complete.  Post the receive for the remote group information and
     wait for it to complete.
. 4) Broadcast information received from the remote leader.
. 5) Create the inter_communicator from the information we now have.
-    An inter-communicator ends up with three levels of communicators.
     The inter-communicator returned to the user, a "collective"
     inter-communicator that can be used for safe communications between
     local & remote groups, and a collective intra-communicator that can
     be used to allocate new contexts during the merge and dup operations.

For the resulting inter-communicator, 'comm_out'

.vb
       comm_out                       = inter-communicator
       comm_out->comm_coll            = "collective" inter-communicator
       comm_out->comm_coll->comm_coll = safe collective intra-communicator
.ve

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_TAG
.N MPI_ERR_EXHAUSTED
.N MPI_ERR_RANK

.seealso: MPI_Intercomm_merge, MPI_Comm_free, MPI_Comm_remote_group,
          MPI_Comm_remote_size
@*/
EXPORT_MPI_API int MPI_Intercomm_create ( MPI_Comm local_comm,
                                          int local_leader,
                                          MPI_Comm peer_comm,
                                          int remote_leader, int tag,
                                          MPI_Comm *comm_out )
{
    int              local_size, local_rank, peer_size, peer_rank;
    int              remote_size;
    int              mpi_errno = MPI_SUCCESS;
    MPIR_CONTEXT     context, send_context;
    struct MPIR_GROUP *remote_group_ptr;
    struct MPIR_COMMUNICATOR *new_comm, *local_comm_ptr, *peer_comm_ptr;
    MPI_Request      req[6];
    MPI_Status       status[6];
    MPIR_ERROR_DECL;
    static char myname[]="MPI_INTERCOMM_CREATE";

    TR_PUSH(myname);
    local_comm_ptr = MPIR_GET_COMM_PTR(local_comm);

#ifndef MPIR_NO_ERROR_CHECKING
    /* Check for valid arguments to function */
    MPIR_TEST_MPI_COMM(local_comm,local_comm_ptr,local_comm_ptr,myname);
    MPIR_TEST_SEND_TAG(tag);
    if (mpi_errno)
        return MPIR_ERROR(local_comm_ptr, mpi_errno, myname );
#endif

    if (local_comm == MPI_COMM_NULL) {
        mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_LOCAL_COMM, myname,
                           "Local communicator must not be MPI_COMM_NULL",
                                     (char *)0 );
        return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
    }

    (void) MPIR_Comm_size ( local_comm_ptr, &local_size );
    (void) MPIR_Comm_rank ( local_comm_ptr, &local_rank );

    if ( local_leader == local_rank ) {
        /* Peer_comm need be valid only at local_leader */
        peer_comm_ptr = MPIR_GET_COMM_PTR(peer_comm);
        if ((MPIR_TEST_COMM_NOTOK(peer_comm,peer_comm_ptr) ||
             (peer_comm == MPI_COMM_NULL))) {
            mpi_errno = MPIR_Err_setmsg( MPI_ERR_COMM, MPIR_ERR_PEER_COMM,
                                         myname,
                                         "Peer communicator is not valid",
                                         (char *)0 );
            return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
        }

        (void) MPIR_Comm_size ( peer_comm_ptr, &peer_size );
        (void) MPIR_Comm_rank ( peer_comm_ptr, &peer_rank );

        if (((peer_rank == MPI_UNDEFINED) && (mpi_errno = MPI_ERR_RANK)))
            return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );

        if (((remote_leader >= peer_size) && (mpi_errno = MPI_ERR_RANK)) ||
            ((remote_leader <  0)         && (mpi_errno = MPI_ERR_RANK))) {
            mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_REMOTE_RANK,
                                         myname,
                                         "Error specifying remote_leader",
"Error specifying remote_leader; value %d not between 0 and %d",
                                         remote_leader, peer_size );
            return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
        }
    }

    if (((local_leader >= local_size) && (mpi_errno = MPI_ERR_RANK)) ||
        ((local_leader <  0)          && (mpi_errno = MPI_ERR_RANK))) {
        mpi_errno = MPIR_Err_setmsg( MPI_ERR_RANK, MPIR_ERR_LOCAL_RANK,
                                     myname,
                                     "Error specifying local_leader",
"Error specifying local_leader; value %d not between 0 and %d",
                                     local_leader, local_size );
        return MPIR_ERROR( local_comm_ptr, mpi_errno, myname );
    }

    /* Allocate send context, inter-coll context and intra-coll context */
    MPIR_Context_alloc ( local_comm_ptr, 3, &context );

    /* If I'm the local leader, then exchange information */
    if (local_rank == local_leader) {
        MPIR_ERROR_PUSH(peer_comm_ptr);

        /* Post the receives for the information from the remote_leader */
        /* We don't post a receive for the remote group yet, because we  */
        /* don't know how big it is yet.                                 */
        MPIR_CALL_POP(MPI_Irecv (&remote_size, 1, MPI_INT, remote_leader, tag,
                                 peer_comm, &(req[2])),peer_comm_ptr,myname);
        MPIR_CALL_POP(MPI_Irecv (&send_context, 1, MPIR_CONTEXT_TYPE,
                                 remote_leader, tag, peer_comm, &(req[3])),
                      peer_comm_ptr,myname);

        /* Send the lrank_to_grank table of the local_comm and an allocated */
        /* context.  Currently I use multiple messages to send this info.   */
        /* Eventually, this will change(?)                                  */
        MPIR_CALL_POP(MPI_Isend (&local_size, 1, MPI_INT, remote_leader, tag,
                                 peer_comm, &(req[0])),peer_comm_ptr,myname);
        MPIR_CALL_POP(MPI_Isend (&context, 1, MPIR_CONTEXT_TYPE, remote_leader,
                                 tag, peer_comm, &(req[1])),peer_comm_ptr,
                      myname);

        /* Wait on the communication requests to finish */
        MPIR_CALL_POP(MPI_Waitall ( 4, req, status ),peer_comm_ptr,myname);

        /* We now know how big the remote group is, so create it */
        remote_group_ptr = MPIR_CreateGroup ( remote_size );
        remote_group_ptr->self =
            (MPI_Group) MPIR_FromPointer( remote_group_ptr );

        /* Post the receive for the group information */
        MPIR_CALL_POP(MPI_Irecv (remote_group_ptr->lrank_to_grank,
                                 remote_size, MPI_INT, remote_leader, tag,
                                 peer_comm, &(req[5])),peer_comm_ptr,myname);

        /* Send the local group info to the remote group */
        MPIR_CALL_POP(MPI_Isend (local_comm_ptr->group->lrank_to_grank,
                                 local_size, MPI_INT, remote_leader, tag,
                                 peer_comm, &(req[4])),peer_comm_ptr,myname);

        /* wait on the send and the receive for the group information */
        MPIR_CALL_POP(MPI_Waitall ( 2, &(req[4]), &(status[4]) ),
                      peer_comm_ptr, myname);
        MPIR_ERROR_POP(peer_comm_ptr);

        /* Now we can broadcast the group information to the other local */
        /* comm members.                                                  */
        MPIR_ERROR_PUSH(local_comm_ptr);
        MPIR_CALL_POP(MPI_Bcast(&remote_size,1,MPI_INT,local_rank,local_comm),
                      local_comm_ptr,myname);
        MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size,
                                MPI_INT, local_rank, local_comm),
                      local_comm_ptr, myname);
        MPIR_ERROR_POP(local_comm_ptr);
    }
    /* Else I'm just an ordinary comm member, so receive the bcast'd */
    /* info about the remote group                                   */
    else {
        MPIR_ERROR_PUSH(local_comm_ptr);
        MPIR_CALL_POP(MPI_Bcast(&remote_size, 1, MPI_INT, local_leader,
                                local_comm),local_comm_ptr,myname);

        /* We now know how big the remote group is, so create it */
        remote_group_ptr = MPIR_CreateGroup ( remote_size );
        remote_group_ptr->self =
            (MPI_Group) MPIR_FromPointer( remote_group_ptr );

        /* Receive the group info */
        MPIR_CALL_POP(MPI_Bcast(remote_group_ptr->lrank_to_grank, remote_size,
                                MPI_INT, local_leader, local_comm),
                      local_comm_ptr,myname );
        MPIR_ERROR_POP(local_comm_ptr);
    }

    MPIR_ERROR_PUSH(local_comm_ptr);
    /* Broadcast the send context */
    MPIR_CALL_POP(MPI_Bcast(&send_context, 1, MPIR_CONTEXT_TYPE,
                            local_leader, local_comm),local_comm_ptr,myname);
    MPIR_ERROR_POP(local_comm_ptr);

    /* We all now have all the information necessary, start building the */
    /* inter-communicator                                                 */
    MPIR_ALLOC(new_comm,NEW(struct MPIR_COMMUNICATOR),local_comm_ptr,
               MPI_ERR_EXHAUSTED,myname );
    MPIR_Comm_init( new_comm, local_comm_ptr, MPIR_INTER );
    *comm_out = new_comm->self;
    new_comm->group = remote_group_ptr;
    MPIR_Group_dup( local_comm_ptr->group, &(new_comm->local_group) );
    new_comm->local_rank     = new_comm->local_group->local_rank;
    new_comm->lrank_to_grank = new_comm->group->lrank_to_grank;
    new_comm->np             = new_comm->group->np;
    new_comm->send_context   = send_context;
    new_comm->recv_context   = context;
    new_comm->comm_name      = 0;
    if ((mpi_errno = MPID_CommInit( local_comm_ptr, new_comm )) )
        return mpi_errno;
    (void) MPIR_Attr_create_tree ( new_comm );

    /* Build the collective inter-communicator */
    MPIR_Comm_make_coll( new_comm, MPIR_INTER );
    MPIR_Comm_make_onesided( new_comm, MPIR_INTER );

    /* Build the collective intra-communicator.  Note that we require
       an intra-communicator for the "coll_comm" so that MPI_COMM_DUP
       can use it for some collective operations (do we need this
       for MPI-2 with intercommunicator collective?)

       Note that this really isn't the right thing to do; we need to replace
       *all* of the Mississippi state collective code.
     */
    MPIR_Comm_make_coll( new_comm->comm_coll, MPIR_INTRA );
#if 0
    MPIR_Comm_make_coll( new_comm->comm_onesided, MPIR_INTRA );
#endif

    /* Remember it for the debugger */
    MPIR_Comm_remember ( new_comm );

    TR_POP;
    return (mpi_errno);
}
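/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * MPI_COMM_WORLD is split into two disjoint halves and an intercommunicator
 * is built between them, with rank 0 of each half acting as leader and
 * MPI_COMM_WORLD as the peer communicator.  Assumes at least two processes.
 */
#if 0
#include "mpi.h"

int main( int argc, char *argv[] )
{
    int rank, color, remote_leader;
    MPI_Comm local_comm, intercomm;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    color = (rank % 2);                     /* two disjoint groups */
    MPI_Comm_split( MPI_COMM_WORLD, color, rank, &local_comm );

    /* world rank of the other group's leader: 1 for evens, 0 for odds */
    remote_leader = color ? 0 : 1;
    MPI_Intercomm_create( local_comm, 0, MPI_COMM_WORLD, remote_leader,
                          99, &intercomm );

    MPI_Comm_free( &intercomm );
    MPI_Comm_free( &local_comm );
    MPI_Finalize();
    return 0;
}
#endif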
/*@

MPI_Bsend - Basic send with user-specified buffering

Input Parameters:
+ buf - initial address of send buffer (choice)
. count - number of elements in send buffer (nonnegative integer)
. datatype - datatype of each send buffer element (handle)
. dest - rank of destination (integer)
. tag - message tag (integer)
- comm - communicator (handle)

Notes:
This send is provided as a convenience function; it allows the user to
send messages without worrying about where they are buffered (because the
user `must` have provided buffer space with 'MPI_Buffer_attach').

In deciding how much buffer space to allocate, remember that the buffer space
is not available for reuse by subsequent 'MPI_Bsend's unless you are certain
that the message has been received (not just that it should have been
received).  For example, this code does not allocate enough buffer space
.vb
    MPI_Buffer_attach( b, n*sizeof(double) + MPI_BSEND_OVERHEAD );
    for (i=0; i<m; i++) {
        MPI_Bsend( buf, n, MPI_DOUBLE, ... );
    }
.ve
because only enough buffer space is provided for a single send, and the
loop may start a second 'MPI_Bsend' before the first is done making use of
the buffer.

In C, you can force the messages to be delivered by
.vb
    MPI_Buffer_detach( &b, &n );
    MPI_Buffer_attach( b, n );
.ve
(The 'MPI_Buffer_detach' will not complete until all buffered messages are
delivered.)

.N fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_COMM
.N MPI_ERR_COUNT
.N MPI_ERR_TYPE
.N MPI_ERR_RANK
.N MPI_ERR_TAG

.seealso: MPI_Buffer_attach, MPI_Ibsend, MPI_Bsend_init
@*/
int MPI_Bsend( void *buf, int count, MPI_Datatype datatype, int dest,
               int tag, MPI_Comm comm )
{
    MPI_Request handle;
    MPI_Status  status;
    int         mpi_errno = MPI_SUCCESS;
    struct MPIR_COMMUNICATOR *comm_ptr;
    MPIR_ERROR_DECL;
    static char myname[] = "MPI_BSEND";

    disableSignal();
    TR_PUSH(myname);

    if (dest != MPI_PROC_NULL) {
        /* We should let Ibsend find the errors, but we will soon add
           a special case for faster Bsend and we'll need these tests then */

        comm_ptr = MPIR_GET_COMM_PTR(comm);
#ifndef MPIR_NO_ERROR_CHECKING
        MPIR_TEST_MPI_COMM(comm,comm_ptr,comm_ptr,myname);
        MPIR_TEST_COUNT(count);
        MPIR_TEST_SEND_TAG(tag);
        MPIR_TEST_SEND_RANK(comm_ptr,dest);
        if (mpi_errno) {
            revertSignal();
            return MPIR_ERROR(comm_ptr, mpi_errno, myname );
        }
#endif

        /* ? BsendDatatype?
           MPID_BsendContig( comm, buf, len, src_lrank, tag, context_id,
                             dest_grank, msgrep, &mpi_errno );
           if (!mpi_errno) return MPI_SUCCESS;
           if (mpi_errno != MPIR_ERR_MAY_BLOCK)
               return MPIR_ERROR( comm, mpi_errno, myname );
         */
        MPIR_ERROR_PUSH(comm_ptr);
        /* We don't use MPIR_CALL_POP so that we can free the handle */
        handle = MPI_REQUEST_NULL;
        if ((mpi_errno = MPI_Ibsend( buf, count, datatype, dest, tag, comm,
                                     &handle ))) {
            MPIR_ERROR_POP(comm_ptr);
            if (handle != MPI_REQUEST_NULL)
                MPID_SendFree( handle );
            revertSignal();
            return MPIR_ERROR(comm_ptr,mpi_errno,myname);
        }
        /* This Wait only completes the transfer of data into the
           buffer area.  The test/wait in util/bsendutil.c completes
           the actual transfer */
        MPIR_CALL_POP(MPI_Wait( &handle, &status ),comm_ptr,myname);
        MPIR_ERROR_POP(comm_ptr);
    }
    TR_POP;
    revertSignal();
    return mpi_errno;
}
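/*
 * Illustrative usage sketch (compiled out, not part of the library source):
 * attaching a buffer large enough for the single buffered send below,
 * performing the MPI_Bsend, and detaching the buffer (which waits until the
 * buffered message has been delivered).  The helper name is an example only.
 */
#if 0
#include <stdlib.h>
#include "mpi.h"

void buffered_send( double *data, int n, int dest, MPI_Comm comm )
{
    void *buf, *oldbuf;
    int   bufsize, oldsize;

    bufsize = n * sizeof(double) + MPI_BSEND_OVERHEAD;
    buf     = malloc( bufsize );
    MPI_Buffer_attach( buf, bufsize );

    MPI_Bsend( data, n, MPI_DOUBLE, dest, 0, comm );

    MPI_Buffer_detach( &oldbuf, &oldsize );   /* blocks until delivery */
    free( oldbuf );
}
#endif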