/*
 * Nonblocking barrier on an inter-communicator.
 *
 * Builds and starts a schedule made of one-byte messages:
 *   - local rank 0 first posts a receive from every remote rank except
 *     remote rank 0,
 *   - every local rank posts a receive from, then a send to, remote rank 0,
 *   - local rank 0 finally appends a schedule barrier followed by a send
 *     to every remote rank.
 *
 * All messages use offset 0 with the "tmpbuf" flag set to true, i.e. they
 * presumably live in the two-byte handle->tmpbuf allocated here
 * (TODO confirm against NBC_Sched_send/NBC_Sched_recv).
 *
 * comm    - communicator the barrier runs on; its remote size drives the loops
 * request - receives the libnbc request created by NBC_Init_handle()
 * module  - libnbc collective module handed to NBC_Init_handle()
 *
 * Returns NBC_OK on success, otherwise the code of the first failing call.
 */
int ompi_coll_libnbc_ibarrier_inter(struct ompi_communicator_t *comm, ompi_request_t ** request,
                                    struct mca_coll_base_module_2_1_0_t *module)
{
  int rank, res, rsize, peer;
  NBC_Schedule *schedule;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if (res != NBC_OK) {
    printf("Error in NBC_Init_handle(%i)\n", res);
    return res;
  }
  handle = (*coll_req);

  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_rank() (%i)\n", res);
    return res;
  }

  res = MPI_Comm_remote_size(comm, &rsize);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res);
    return res;
  }

  /* NOTE(review): the result of this allocation is never checked. */
  handle->tmpbuf = (void*)malloc(2*sizeof(char));

  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));
  if (NULL == schedule) {
    /* NOTE(review): res still holds the success code of the previous call,
     * so this path reports success to the caller; no out-of-resource code
     * is visible in this block to return instead. */
    printf("Error in malloc()\n");
    return res;
  }

  res = NBC_Sched_create(schedule);
  if (res != NBC_OK) {
    printf("Error in NBC_Sched_create (%i)\n", res);
    return res;
  }

  if (0 == rank) {
    for (peer = 1 ; peer < rsize ; ++peer) {
      res = NBC_Sched_recv (0, true, 1, MPI_BYTE, peer, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }
    }
  }

  /* synchronize with the remote root */
  res = NBC_Sched_recv (0, true, 1, MPI_BYTE, 0, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_recv() (%i)\n", res);
    return res;
  }

  res = NBC_Sched_send (0, true, 1, MPI_BYTE, 0, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_send() (%i)\n", res);
    return res;
  }

  if (0 == rank) {
    /* wait for the remote root */
    res = NBC_Sched_barrier(schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_barrier() (%i)\n", res);
      return res;
    }

    /* inform remote peers that all local peers have entered the barrier */
    for (peer = 0 ; peer < rsize ; ++peer) {
      res = NBC_Sched_send (0, true, 1, MPI_BYTE, peer, schedule);
      if (NBC_OK != res) {
        /* the literal below used to be split across two physical lines,
         * which is not a valid C string literal */
        printf("Error in NBC_Sched_send() (%i)\n", res);
        return res;
      }
    }
  }

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_commit() (%i)\n", res);
    return res;
  }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Start() (%i)\n", res);
    return res;
  }

  return NBC_OK;
}
/*
 * allgather_sched_linear
 *
 * Description: an implementation of Iallgather using linear algorithm
 *
 * Time: O(comm_size)
 * Schedule length (rounds): O(comm_size)
 *
 * For every rank other than this one, a receive into that rank's block of
 * recvbuf and a send of this rank's own block of recvbuf are appended to
 * the schedule.  Blocks are rcount elements of rdtype, laid out by rank.
 *
 * sendbuf, scount and sdtype are not read here: data is always sent from
 * this rank's block of recvbuf (presumably the caller has already placed
 * the contribution there -- confirm against the caller).
 *
 * Returns OMPI_SUCCESS, or the first non-success code reported by
 * ompi_datatype_get_extent() / NBC_Sched_recv() / NBC_Sched_send().
 */
static inline int allgather_sched_linear(
    int rank, int comm_size, NBC_Schedule *schedule, const void *sendbuf,
    int scount, struct ompi_datatype_t *sdtype, void *recvbuf, int rcount,
    struct ompi_datatype_t *rdtype)
{
    int res = OMPI_SUCCESS;
    ptrdiff_t rlb, rext;

    res = ompi_datatype_get_extent(rdtype, &rlb, &rext);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        /* do not compute offsets from an extent that was never filled in */
        return res;
    }

    /* size of one rank's block in bytes; widened to ptrdiff_t before any
     * multiplication so that rank * rcount cannot overflow an int */
    const ptrdiff_t block = (ptrdiff_t) rcount * rext;
    char *sbuf = (char *) recvbuf + rank * block;

    for (int remote = 0; remote < comm_size ; ++remote) {
        if (remote == rank) {
            continue;
        }

        /* Recv from rank remote */
        char *rbuf = (char *) recvbuf + remote * block;
        res = NBC_Sched_recv(rbuf, false, rcount, rdtype, remote, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            return res;
        }

        /* Send to rank remote - not from the sendbuf to optimize MPI_IN_PLACE */
        res = NBC_Sched_send(sbuf, false, rcount, rdtype, remote, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            return res;
        }
    }

    return res;
}
/*
 * Pairwise-exchange schedule for Ialltoallv.
 *
 * In step s (1 <= s < p) this rank sends to (rank + s) % p and receives
 * from (rank + p - s) % p.  A peer whose count is zero is skipped.
 * Displacements are given in elements and scaled by sndext / rcvext.
 *
 * The receive is appended with the trailing flag set to true and the send
 * with it set to false (presumably the "barrier after this entry" flag of
 * NBC_Sched_* -- confirm).
 *
 * Returns OMPI_SUCCESS, or the first non-success code from NBC_Sched_*.
 */
__opal_attribute_unused__
static inline int a2av_sched_pairwise(int rank, int p, NBC_Schedule *schedule,
                                      const void *sendbuf, const int *sendcounts, const int *sdispls,
                                      MPI_Aint sndext, MPI_Datatype sendtype,
                                      void *recvbuf, const int *recvcounts, const int *rdispls,
                                      MPI_Aint rcvext, MPI_Datatype recvtype)
{
  for (int step = 1 ; step < p ; ++step) {
    const int dst = (rank + step) % p;
    const int src = (rank + p - step) % p;
    int rc;

    /* outgoing block for this step */
    if (0 != sendcounts[dst]) {
      char *out = ((char *) sendbuf) + (sdispls[dst] * sndext);

      rc = NBC_Sched_send(out, false, sendcounts[dst], sendtype, dst, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }

    /* incoming block for this step */
    if (0 != recvcounts[src]) {
      char *in = ((char *) recvbuf) + (rdispls[src] * rcvext);

      rc = NBC_Sched_recv(in, false, recvcounts[src], recvtype, src, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }
  }

  return OMPI_SUCCESS;
}
/*
 * Linear schedule for Ialltoallv.
 *
 * Walks all ranks 0 .. p-1 except this one and, for each peer with a
 * non-zero count, appends a send and/or a receive.  Displacements are in
 * elements and are scaled by sndext / rcvext to obtain byte offsets.
 *
 * Returns OMPI_SUCCESS, or the first non-success code from NBC_Sched_*.
 */
__opal_attribute_unused__
static inline int a2av_sched_linear(int rank, int p, NBC_Schedule *schedule,
                                    const void *sendbuf, const int *sendcounts, const int *sdispls,
                                    MPI_Aint sndext, MPI_Datatype sendtype,
                                    void *recvbuf, const int *recvcounts, const int *rdispls,
                                    MPI_Aint rcvext, MPI_Datatype recvtype)
{
  int peer = 0;

  while (peer < p) {
    int rc;

    if (peer != rank) {
      /* outgoing block for this peer */
      if (0 != sendcounts[peer]) {
        char *out = ((char *) sendbuf) + (sdispls[peer] * sndext);

        rc = NBC_Sched_send(out, false, sendcounts[peer], sendtype, peer, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
          return rc;
        }
      }

      /* incoming block from this peer */
      if (0 != recvcounts[peer]) {
        char *in = ((char *) recvbuf) + (rdispls[peer] * rcvext);

        rc = NBC_Sched_recv(in, false, recvcounts[peer], recvtype, peer, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
          return rc;
        }
      }
    }

    ++peer;
  }

  return OMPI_SUCCESS;
}
/*
 * Pairwise-exchange schedule for Ialltoallw.
 *
 * In step s (1 <= s < p) this rank sends to (rank + s) % p and receives
 * from (rank + p - s) % p, each with that peer's own datatype.  Peers with
 * a zero count are skipped.  Displacements are added to the buffer
 * pointers unscaled, i.e. they are used as byte offsets.
 *
 * Returns OMPI_SUCCESS, or the first non-success code from NBC_Sched_*.
 */
__opal_attribute_unused__
static inline int a2aw_sched_pairwise(int rank, int p, NBC_Schedule *schedule,
                                      const void *sendbuf, const int *sendcounts, const int *sdispls,
                                      struct ompi_datatype_t * const * sendtypes,
                                      void *recvbuf, const int *recvcounts, const int *rdispls,
                                      struct ompi_datatype_t * const * recvtypes)
{
  for (int step = 1; step < p; ++step) {
    const int dst = (rank + step) % p;
    const int src = (rank + p - step) % p;
    int rc;

    /* outgoing block for this step */
    if (0 != sendcounts[dst]) {
      char *out = (char *) sendbuf + sdispls[dst];

      rc = NBC_Sched_send (out, false, sendcounts[dst], sendtypes[dst], dst, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }

    /* incoming block for this step */
    if (0 != recvcounts[src]) {
      char *in = (char *) recvbuf + rdispls[src];

      rc = NBC_Sched_recv (in, false, recvcounts[src], recvtypes[src], src, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }
  }

  return OMPI_SUCCESS;
}
/*
 * Linear schedule for Ialltoallw.
 *
 * Visits every rank except this one.  A send (respectively receive) is
 * appended only when opal_datatype_span() reports a positive span for
 * that peer's count and datatype.  Displacements are added to the buffer
 * pointers unscaled, i.e. they are used as byte offsets.
 *
 * Returns OMPI_SUCCESS, or the first non-success code from NBC_Sched_*.
 */
static inline int a2aw_sched_linear(int rank, int p, NBC_Schedule *schedule,
                                    const void *sendbuf, const int *sendcounts, const int *sdispls,
                                    struct ompi_datatype_t * const * sendtypes,
                                    void *recvbuf, const int *recvcounts, const int *rdispls,
                                    struct ompi_datatype_t * const * recvtypes)
{
  for (int peer = 0; peer < p; ++peer) {
    ptrdiff_t gap, span;
    int rc;

    if (peer == rank) {
      continue;
    }

    /* anything to send to this peer? */
    span = opal_datatype_span(&sendtypes[peer]->super, sendcounts[peer], &gap);
    if (OPAL_LIKELY(0 < span)) {
      char *out = (char *) sendbuf + sdispls[peer];

      rc = NBC_Sched_send (out, false, sendcounts[peer], sendtypes[peer], peer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }

    /* anything to receive from this peer? */
    span = opal_datatype_span(&recvtypes[peer]->super, recvcounts[peer], &gap);
    if (OPAL_LIKELY(0 < span)) {
      char *in = (char *) recvbuf + rdispls[peer];

      rc = NBC_Sched_recv (in, false, recvcounts[peer], recvtypes[peer], peer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }
  }

  return OMPI_SUCCESS;
}
/*
 * Pairwise-exchange schedule for Ialltoall.
 *
 * In round r (1 <= r < p) this rank receives from (rank - r + p) % p and
 * sends to (rank + r) % p, then appends a schedule barrier so that the
 * rounds are separated.  Nothing is scheduled when p < 2.
 *
 * Blocks are recvcount / sendcount elements each and are laid out by peer
 * rank; rcvext / sndext are the datatype extents in bytes.
 * comm is not used by this function.
 *
 * Returns NBC_OK, or the first error code reported by NBC_Sched_*.
 */
static inline int a2a_sched_pairwise(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext,
                                     NBC_Schedule* schedule, void* sendbuf, int sendcount,
                                     MPI_Datatype sendtype, void* recvbuf, int recvcount,
                                     MPI_Datatype recvtype, MPI_Comm comm)
{
  int res, r, sndpeer, rcvpeer;
  char *rbuf, *sbuf;

  res = NBC_OK;
  if (p < 2) {
    return res;
  }

  for (r = 1; r < p; r++) {
    sndpeer = (rank + r) % p;
    rcvpeer = (rank - r + p) % p;

    /* widen to MPI_Aint before multiplying: peer * count can exceed INT_MAX */
    rbuf = ((char *) recvbuf) + ((MPI_Aint) rcvpeer * recvcount * rcvext);
    res = NBC_Sched_recv(rbuf, false, recvcount, recvtype, rcvpeer, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_recv() (%i)\n", res);
      return res;
    }

    sbuf = ((char *) sendbuf) + ((MPI_Aint) sndpeer * sendcount * sndext);
    res = NBC_Sched_send(sbuf, false, sendcount, sendtype, sndpeer, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_send() (%i)\n", res);
      return res;
    }

    /* a barrier closes every round (the former "r < p" test was always
     * true inside this loop) */
    res = NBC_Sched_barrier(schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_barr() (%i)\n", res);
      return res;
    }
  }

  return res;
}
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, maxr, vpeer, peer, res; root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1 << (r - 1)); VRANK2RANK(peer, vpeer, root) if (peer < p) { /* we have to wait until we have the data */ res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ if (firstred && MPI_IN_PLACE != sendbuf) { /* perform the reduce with the senbuf */ res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); firstred = 0; } else { /* perform the reduce in my local buffer */ res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } } } else {
/*
 * Binomial-tree schedule for Ibcast.
 *
 * Ranks are first mapped to virtual ranks via RANK2VRANK so that the root
 * becomes virtual rank 0.  With maxr = ceil(log(p)/LOG2) rounds:
 *   - a non-root rank receives from virtual rank vrank - 2^r in the round
 *     r where 2^r <= vrank < 2^(r+1), then a schedule barrier is appended;
 *   - every rank then sends to virtual rank vrank + 2^r for each round r
 *     where that peer exists and vrank < 2^r (virtual rank 0 sends in
 *     every round).
 *
 * Returns NBC_OK, or the first error code reported by NBC_Sched_*.
 */
static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule,
                                       void *buffer, int count, MPI_Datatype datatype)
{
  int maxr, vrank, peer, round, res;

  maxr = (int)ceil((log((double)p)/LOG2));
  RANK2VRANK(rank, vrank, root);

  /* receive phase: everybody except the (virtual) root */
  if (0 != vrank) {
    for (round = 0; round < maxr; ++round) {
      if ((vrank < (1<<round)) || (vrank >= (1<<(round+1)))) {
        continue; /* not our round */
      }

      VRANK2RANK(peer, vrank-(1<<round), root);
      res = NBC_Sched_recv(buffer, false, count, datatype, peer, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }
    }

    /* the data has to arrive before it can be forwarded */
    res = NBC_Sched_barrier(schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_barrier() (%i)\n", res);
      return res;
    }
  }

  /* send phase: forward to our children */
  for (round = 0; round < maxr; ++round) {
    const int has_child = (vrank + (1<<round) < p) && (vrank < (1<<round));

    if (!has_child && (vrank != 0)) {
      continue;
    }

    VRANK2RANK(peer, vrank+(1<<round), root);
    res = NBC_Sched_send(buffer, false, count, datatype, peer, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_send() (%i)\n", res);
      return res;
    }
  }

  return NBC_OK;
}
static inline int allred_sched_diss(int rank, int p, int count, MPI_Datatype datatype, void *sendbuf, void *recvbuf, MPI_Op op, NBC_Schedule *schedule, NBC_Handle *handle) { int root, vrank, r, maxr, firstred, vpeer, peer, res; root = 0; /* this makes the code for ireduce and iallreduce nearly identical - could be changed to improve performance */ RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log(p)/LOG2)); firstred = 1; for(r=1; r<=maxr; r++) { if((vrank % (1<<r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1<<(r-1)); VRANK2RANK(peer, vpeer, root) if(peer<p) { res = NBC_Sched_recv(0, true, count, datatype, peer, schedule); if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } if(firstred && MPI_IN_PLACE != sendbuf) { /* perform the reduce with the senbuf */ res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); firstred = 0; } else { /* perform the reduce in my local buffer */ res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); } if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if(res != NBC_OK) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } } else {
/*
 * simple chained MPI_Ibcast
 *
 * The message is cut into fragments of roughly fragsize bytes and passed
 * along a chain of virtual ranks: each rank receives a fragment from
 * virtual rank vrank-1 and forwards it to virtual rank vrank+1.  Virtual
 * rank 0 (the root) only sends, virtual rank p-1 only receives.
 *
 * rank, p, root - calling rank, communicator size and broadcast root
 * schedule      - schedule the operations are appended to
 * buffer/count/datatype - the broadcast buffer
 * fragsize      - target fragment size in bytes
 * size          - size of one datatype element in bytes
 *
 * A zero element size or a non-positive fragsize cannot be fragmented; in
 * both cases the whole message is scheduled as a single fragment instead
 * of dividing by zero.
 *
 * Returns OMPI_SUCCESS, or the first non-success code from
 * ompi_datatype_type_extent() / NBC_Sched_*.
 */
static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule,
                                    void *buffer, int count, MPI_Datatype datatype,
                                    int fragsize, size_t size)
{
  int res, vrank, rpeer, speer, numfrag, fragcount, thiscount;
  MPI_Aint ext;
  char *buf;

  RANK2VRANK(rank, vrank, root);
  VRANK2RANK(rpeer, vrank-1, root);
  VRANK2RANK(speer, vrank+1, root);

  res = ompi_datatype_type_extent(datatype, &ext);
  if (MPI_SUCCESS != res) {
    NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
    return res;
  }

  if (count == 0) {
    return OMPI_SUCCESS;
  }

  /* number of fragments, rounded up */
  numfrag = 0;
  if (fragsize > 0) {
    numfrag = count * size/fragsize;
    if ((count * size) % fragsize != 0) {
      numfrag++;
    }
  }
  if (numfrag < 1) {
    /* size == 0 or fragsize <= 0: fall back to one fragment so that the
     * division below is well defined */
    numfrag = 1;
  }

  fragcount = count/numfrag;

  for (int fragnum = 0 ; fragnum < numfrag ; ++fragnum) {
    buf = (char *) buffer + fragnum * fragcount * ext;
    thiscount = fragcount;
    if (fragnum == numfrag-1) {
      /* last fragment may not be full */
      thiscount = count - fragcount * fragnum;
    }

    /* root does not receive */
    if (vrank != 0) {
      res = NBC_Sched_recv (buf, false, thiscount, datatype, rpeer, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }
    }

    /* last rank does not send */
    if (vrank != p-1) {
      res = NBC_Sched_send (buf, false, thiscount, datatype, speer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
      }

      /* this barrier here seems awkward but isn't!!!! */
      if (vrank == 0) {
        res = NBC_Sched_barrier (schedule);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
      }
    }
  }

  return OMPI_SUCCESS;
}
/*
 * simple linear Alltoallv (inter-communicator version)
 *
 * For every rank i of the remote group a send of sendcounts[i] elements
 * and a receive of recvcounts[i] elements is scheduled; peers with a zero
 * count are skipped.  Displacements are in elements and are scaled by the
 * extent of sendtype / recvtype.  The schedule is then committed and
 * started on the handle created by NBC_Init_handle().
 *
 * Returns NBC_OK on success, otherwise the code of the first failing call.
 */
int ompi_coll_libnbc_ialltoallv_inter (void* sendbuf, int *sendcounts, int *sdispls,
                                       MPI_Datatype sendtype, void* recvbuf, int *recvcounts, int *rdispls,
                                       MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                       struct mca_coll_base_module_2_0_0_t *module)
{
  int rank, res, i, rsize;
  MPI_Aint sndext, rcvext;
  NBC_Schedule *schedule;
  char *rbuf, *sbuf;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if (res != NBC_OK) {
    printf("Error in NBC_Init_handle(%i)\n", res);
    return res;
  }
  handle = (*coll_req);

  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_rank() (%i)\n", res);
    return res;
  }

  res = MPI_Type_extent(sendtype, &sndext);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Type_extent() (%i)\n", res);
    return res;
  }

  res = MPI_Type_extent(recvtype, &rcvext);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Type_extent() (%i)\n", res);
    return res;
  }

  /* rsize bounds the loop below, so a failure here must not be ignored */
  res = MPI_Comm_remote_size (comm, &rsize);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res);
    return res;
  }

  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));
  if (NULL == schedule) {
    /* NOTE(review): res still holds the success code of the previous call,
     * so this path reports success to the caller; no out-of-resource code
     * is visible in this block to return instead. */
    printf("Error in malloc() (%i)\n", res);
    return res;
  }

  handle->tmpbuf = NULL;

  res = NBC_Sched_create(schedule);
  if (res != NBC_OK) {
    printf("Error in NBC_Sched_create (%i)\n", res);
    return res;
  }

  for (i = 0; i < rsize; i++) {
    /* post all sends */
    if (sendcounts[i] != 0) {
      sbuf = ((char *) sendbuf) + (sdispls[i] * sndext);
      res = NBC_Sched_send(sbuf, false, sendcounts[i], sendtype, i, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_send() (%i)\n", res);
        return res;
      }
    }

    /* post all receives */
    if (recvcounts[i] != 0) {
      rbuf = ((char *) recvbuf) + (rdispls[i] * rcvext);
      res = NBC_Sched_recv(rbuf, false, recvcounts[i], recvtype, i, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }
    }
  }

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_commit() (%i)\n", res);
    return res;
  }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Start() (%i)\n", res);
    return res;
  }

  return NBC_OK;
}
/*
 * Build the schedule for an inter-communicator allgather and turn it into
 * a request.
 *
 * For every rank r of the remote group a receive of recvcount elements
 * into block r of recvbuf and a send of the local contribution
 * (sendbuf / sendcount / sendtype) to r are scheduled.
 *
 * persistent is forwarded unchanged to NBC_Schedule_request().
 *
 * Returns OMPI_SUCCESS; OMPI_ERR_OUT_OF_RESOURCE if the schedule object
 * cannot be created; otherwise the first non-success code of a callee.
 * The schedule is released on every failure after its creation.
 */
static int nbc_allgather_inter_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype,
                                    void* recvbuf, int recvcount, MPI_Datatype recvtype,
                                    struct ompi_communicator_t *comm, ompi_request_t ** request,
                                    struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
  int res, rsize;
  MPI_Aint rcvext;
  NBC_Schedule *schedule;
  char *rbuf;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = ompi_datatype_type_extent(recvtype, &rcvext);
  if (MPI_SUCCESS != res) {
    NBC_Error ("MPI Error in ompi_datatype_type_extent() (%i)", res);
    return res;
  }

  rsize = ompi_comm_remote_size (comm);

  /* set up schedule */
  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* do rsize rounds, one per remote rank */
  for (int r = 0 ; r < rsize ; ++r) {
    /* recv from rank r; widen to MPI_Aint before multiplying so that
     * r * recvcount cannot overflow an int */
    rbuf = (char *) recvbuf + (MPI_Aint) r * recvcount * rcvext;
    res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, r, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

    /* send to rank r */
    res = NBC_Sched_send (sendbuf, false, sendcount, sendtype, r, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  return OMPI_SUCCESS;
}
/*
 * Nonblocking gatherv on an inter-communicator.
 *
 * The role of the caller is selected by root:
 *   - MPI_ROOT:      schedules one receive per remote rank i, of
 *                    recvcounts[i] elements at element offset displs[i];
 *   - MPI_PROC_NULL: schedules nothing;
 *   - anything else: schedules a single send of the local contribution
 *                    to rank root.
 * The schedule is then committed and started on the handle created by
 * NBC_Init_handle().
 *
 * Returns NBC_OK on success, otherwise the code of the first failing call.
 */
int ompi_coll_libnbc_igatherv_inter (void* sendbuf, int sendcount, MPI_Datatype sendtype,
                                     void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype,
                                     int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                     struct mca_coll_base_module_2_0_0_t *module)
{
  int rank, p, res, i, rsize;
  MPI_Aint rcvext;
  NBC_Schedule *schedule;
  char *rbuf;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if (NBC_OK != res) {
    printf("Error in NBC_Init_handle(%i)\n", res);
    return res;
  }
  handle = (*coll_req);

  /* query the communicator */
  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_rank() (%i)\n", res);
    return res;
  }

  res = MPI_Comm_size(comm, &p);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_size() (%i)\n", res);
    return res;
  }

  res = MPI_Comm_remote_size (comm, &rsize);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res);
    return res;
  }

  /* only the root needs the receive extent */
  if (MPI_ROOT == root) {
    res = MPI_Type_extent(recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
      printf("MPI Error in MPI_Type_extent() (%i)\n", res);
      return res;
    }
  }

  handle->tmpbuf = NULL;

  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));
  if (NULL == schedule) {
    /* NOTE(review): res holds the success code of the previous call here,
     * so an allocation failure is reported to the caller as success. */
    printf("Error in malloc() (%i)\n", res);
    return res;
  }

  res = NBC_Sched_create(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_create (%i)\n", res);
    return res;
  }

  if (MPI_ROOT == root) {
    /* root receives every remote contribution into its own slot */
    for (i = 0 ; i < rsize ; ++i) {
      rbuf = ((char *)recvbuf) + (displs[i]*rcvext);
      res = NBC_Sched_recv(rbuf, false, recvcounts[i], recvtype, i, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }
    }
  } else if (MPI_PROC_NULL != root) {
    /* send msg to root */
    res = NBC_Sched_send(sendbuf, false, sendcount, sendtype, root, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_send() (%i)\n", res);
      return res;
    }
  }

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_commit() (%i)\n", res);
    return res;
  }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Start() (%i)\n", res);
    return res;
  }

  return NBC_OK;
}
/*
 * Nonblocking broadcast on an inter-communicator.
 *
 * The role of the caller is selected by root:
 *   - MPI_PROC_NULL: nothing is scheduled;
 *   - MPI_ROOT:      one send of the buffer to every rank of the remote
 *                    group is scheduled;
 *   - anything else: a single receive from rank root is scheduled.
 * The schedule is then committed and started on the handle created by
 * NBC_Init_handle().
 *
 * Returns NBC_OK on success, otherwise the code of the first failing call.
 */
int ompi_coll_libnbc_ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root,
                                  struct ompi_communicator_t *comm, ompi_request_t ** request,
                                  struct mca_coll_base_module_2_0_0_t *module)
{
  int rank, p, res, size, peer;
  NBC_Schedule *schedule;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if (res != NBC_OK) {
    printf("Error in NBC_Init_handle(%i)\n", res);
    return res;
  }
  handle = (*coll_req);

  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_rank() (%i)\n", res);
    return res;
  }

  res = MPI_Comm_size(comm, &p);
  if (MPI_SUCCESS != res) {
    /* this message used to name MPI_Comm_rank(), hiding the real culprit */
    printf("MPI Error in MPI_Comm_size() (%i)\n", res);
    return res;
  }

  res = MPI_Type_size(datatype, &size);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Type_size() (%i)\n", res);
    return res;
  }

  handle->tmpbuf = NULL;

  /* NOTE(review): the result of this allocation is passed on unchecked;
   * no out-of-resource code is visible in this block to return. */
  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));

  res = NBC_Sched_create(schedule);
  if (res != NBC_OK) {
    printf("Error in NBC_Sched_create, res = %i\n", res);
    return res;
  }

  if (root != MPI_PROC_NULL) {
    /* send to all others */
    if (root == MPI_ROOT) {
      int remsize;

      res = MPI_Comm_remote_size(comm, &remsize);
      if (MPI_SUCCESS != res) {
        printf("MPI_Comm_remote_size() failed\n");
        return res;
      }

      for (peer = 0; peer < remsize; peer++) {
        /* send msg to peer */
        res = NBC_Sched_send(buffer, false, count, datatype, peer, schedule);
        if (NBC_OK != res) {
          printf("Error in NBC_Sched_send() (%i)\n", res);
          return res;
        }
      }
    } else {
      /* recv msg from root */
      res = NBC_Sched_recv(buffer, false, count, datatype, root, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }
    }
  }

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_commit() (%i)\n", res);
    return res;
  }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) {
    /* the literal below used to be split across two physical lines,
     * which is not a valid C string literal */
    printf("Error in NBC_Start() (%i)\n", res);
    return res;
  }

  return NBC_OK;
}
/*
 * Nonblocking allgatherv on an inter-communicator.
 *
 * First one receive per remote rank r is scheduled (recvcounts[r]
 * elements at element offset displs[r]; ranks with a zero count are
 * skipped).  Then, if sendcount is non-zero, the local contribution is
 * sent to every remote rank.  The schedule is committed and started on
 * the handle created by NBC_Init_handle().
 *
 * Returns NBC_OK on success, otherwise the code of the first failing call.
 */
int ompi_coll_libnbc_iallgatherv_inter(void* sendbuf, int sendcount, MPI_Datatype sendtype,
                                       void* recvbuf, int *recvcounts, int *displs, MPI_Datatype recvtype,
                                       struct ompi_communicator_t *comm, ompi_request_t ** request,
                                       struct mca_coll_base_module_2_1_0_t *module)
{
  int rank, res, r, rsize;
  MPI_Aint rcvext;
  NBC_Schedule *schedule;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if (NBC_OK != res) {
    printf("Error in NBC_Init_handle(%i)\n", res);
    return res;
  }
  handle = (*coll_req);

  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_rank() (%i)\n", res);
    return res;
  }

  res = MPI_Comm_remote_size(comm, &rsize);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Comm_remote_size() (%i)\n", res);
    return res;
  }

  res = MPI_Type_extent(recvtype, &rcvext);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Type_extent() (%i)\n", res);
    return res;
  }

  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));
  if (NULL == schedule) {
    /* NOTE(review): res holds the success code of the previous call here,
     * so an allocation failure is reported to the caller as success. */
    printf("Error in malloc() (%i)\n", res);
    return res;
  }

  handle->tmpbuf = NULL;

  res = NBC_Sched_create(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_create, (%i)\n", res);
    return res;
  }

  /* one receive per remote rank that contributes data */
  for (r = 0 ; r < rsize ; ++r) {
    char *slot;

    if (0 == recvcounts[r]) {
      continue;
    }

    slot = ((char *)recvbuf) + (displs[r]*rcvext);
    res = NBC_Sched_recv(slot, false, recvcounts[r], recvtype, r, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_recv() (%i)\n", res);
      return res;
    }
  }

  /* one send per remote rank, unless there is nothing to contribute */
  for (r = 0 ; (0 != sendcount) && (r < rsize) ; ++r) {
    res = NBC_Sched_send(sendbuf, false, sendcount, sendtype, r, schedule);
    if (NBC_OK != res) {
      printf("Error in NBC_Sched_send() (%i)\n", res);
      return res;
    }
  }

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Sched_commit() (%i)\n", res);
    return res;
  }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) {
    printf("Error in NBC_Start() (%i)\n", res);
    return res;
  }

  return NBC_OK;
}
/*
 * simple linear Alltoallw (inter-communicator version)
 *
 * For each rank of the remote group a send and a receive are scheduled
 * with that peer's own count, datatype and displacement; peers with a
 * zero count are skipped.  Displacements are added to the buffer
 * pointers unscaled, i.e. they are used as byte offsets.  The committed
 * schedule is handed to NBC_Schedule_request() together with persistent.
 *
 * Returns OMPI_SUCCESS; OMPI_ERR_OUT_OF_RESOURCE if the schedule object
 * cannot be created; otherwise the first non-success code of a callee.
 * The schedule is released on every failure after its creation.
 */
static int nbc_alltoallw_inter_init (const void* sendbuf, const int *sendcounts, const int *sdispls,
                                     struct ompi_datatype_t * const *sendtypes, void* recvbuf, const int *recvcounts, const int *rdispls,
                                     struct ompi_datatype_t * const *recvtypes, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                     struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
  int res, rsize;
  NBC_Schedule *schedule;
  char *rbuf, *sbuf;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  rsize = ompi_comm_remote_size (comm);

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  for (int peer = 0 ; peer < rsize ; ++peer) {
    /* outgoing data for this peer */
    if (0 != sendcounts[peer]) {
      sbuf = (char *) sendbuf + sdispls[peer];
      res = NBC_Sched_send (sbuf, false, sendcounts[peer], sendtypes[peer], peer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        goto release_and_fail;
      }
    }

    /* incoming data from this peer */
    if (0 != recvcounts[peer]) {
      rbuf = (char *) recvbuf + rdispls[peer];
      res = NBC_Sched_recv (rbuf, false, recvcounts[peer], recvtypes[peer], peer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        goto release_and_fail;
      }
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto release_and_fail;
  }

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto release_and_fail;
  }

  return OMPI_SUCCESS;

release_and_fail:
  /* single exit for every failure once the schedule exists */
  OBJ_RELEASE(schedule);
  return res;
}
/*
 * Nonblocking broadcast on an inter-communicator.
 *
 * The role of the caller is selected by root:
 *   - MPI_ROOT:      one send of the buffer to every rank of the remote
 *                    group is scheduled;
 *   - MPI_PROC_NULL: nothing is scheduled;
 *   - anything else: a single receive from rank root is scheduled.
 * The committed schedule is handed to NBC_Schedule_request().
 *
 * Returns OMPI_SUCCESS; OMPI_ERR_OUT_OF_RESOURCE if the schedule object
 * cannot be created; otherwise the first non-success code of a callee.
 * The schedule is released on every failure after its creation.
 */
int ompi_coll_libnbc_ibcast_inter(void *buffer, int count, MPI_Datatype datatype, int root,
                                  struct ompi_communicator_t *comm, ompi_request_t ** request,
                                  struct mca_coll_base_module_2_2_0_t *module)
{
  int res;
  NBC_Schedule *schedule;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  if (MPI_ROOT == root) {
    /* the root pushes the buffer to every remote rank */
    int remsize = ompi_comm_remote_size (comm);

    for (int peer = 0 ; peer < remsize ; ++peer) {
      res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        goto release_and_fail;
      }
    }
  } else if (MPI_PROC_NULL != root) {
    /* everybody in the receiving group gets the message from root */
    res = NBC_Sched_recv (buffer, false, count, datatype, root, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      goto release_and_fail;
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto release_and_fail;
  }

  res = NBC_Schedule_request(schedule, comm, libnbc_module, request, NULL);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto release_and_fail;
  }

  return OMPI_SUCCESS;

release_and_fail:
  /* single exit for every failure once the schedule exists */
  OBJ_RELEASE(schedule);
  return res;
}
static inline int red_sched_binomial (int rank, int p, int root, const void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { int vrank, vpeer, peer, res, maxr; RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); for (int r = 1, firstred = 1 ; r <= maxr ; ++r) { if ((vrank % (1 << r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1 << (r - 1)); VRANK2RANK(peer, vpeer, root) if (peer < p) { /* we have to wait until we have the data */ res = NBC_Sched_recv (0, true, count, datatype, peer, schedule, true); if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } /* perform the reduce in my local buffer */ /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */ if (firstred) { if (rank == root) { /* root is the only one who reduces in the receivebuffer * take data from sendbuf in first round - save copy */ res = NBC_Sched_op (recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule, true); } else { /* all others may not have a receive buffer * take data from sendbuf in first round - save copy */ res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule, true); } firstred = 0; } else { if(rank == root) { /* root is the only one who reduces in the receivebuffer */ res = NBC_Sched_op (recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule, true); } else { /* all others may not have a receive buffer */ res = NBC_Sched_op ((char *) redbuf - (intptr_t) handle->tmpbuf, true, (char *) redbuf - (intptr_t) handle->tmpbuf, true, 0, true, count, datatype, op, schedule, true); } } if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) { return res; } } } else {
static inline int red_sched_binomial(int rank, int p, int root, void *sendbuf, void *recvbuf, int count, MPI_Datatype datatype, MPI_Op op, void *redbuf, NBC_Schedule *schedule, NBC_Handle *handle) { int firstred, vrank, vpeer, peer, res, maxr, r; RANK2VRANK(rank, vrank, root); maxr = (int)ceil((log((double)p)/LOG2)); firstred = 1; for(r=1; r<=maxr; r++) { if((vrank % (1<<r)) == 0) { /* we have to receive this round */ vpeer = vrank + (1<<(r-1)); VRANK2RANK(peer, vpeer, root) if(peer<p) { res = NBC_Sched_recv(0, true, count, datatype, peer, schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; } /* we have to wait until we have the data */ res = NBC_Sched_barrier(schedule); if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } /* perform the reduce in my local buffer */ if(firstred) { if(rank == root) { /* root is the only one who reduces in the receivebuffer * take data from sendbuf in first round - save copy */ res = NBC_Sched_op(recvbuf, false, sendbuf, false, 0, true, count, datatype, op, schedule); } else { /* all others may not have a receive buffer * take data from sendbuf in first round - save copy */ res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule); } firstred = 0; } else { if(rank == root) { /* root is the only one who reduces in the receivebuffer */ res = NBC_Sched_op(recvbuf, false, recvbuf, false, 0, true, count, datatype, op, schedule); } else { /* all others may not have a receive buffer */ res = NBC_Sched_op((char *)redbuf-(unsigned long)handle->tmpbuf, true, (char *)redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule); } } if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; } /* this cannot be done until handle->tmpbuf is unused :-( */ res = NBC_Sched_barrier(schedule); if 
(NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; } } } else {
/*
 * simple chained MPI_Ibcast
 *
 * The message is cut into fragments of roughly fragsize bytes and passed
 * along a chain of virtual ranks: each rank receives a fragment from
 * virtual rank vrank-1, waits for it (schedule barrier) and forwards it
 * to virtual rank vrank+1.  Virtual rank 0 (the root) only sends and adds
 * a barrier after each send; virtual rank p-1 only receives.
 *
 * fragsize is the target fragment size in bytes, size the size of one
 * datatype element in bytes.  If either is not positive the message
 * cannot be fragmented and is scheduled as one single fragment.
 *
 * Returns NBC_OK, or the first error code reported by MPI_Type_extent()
 * or NBC_Sched_*.
 */
static inline int bcast_sched_chain(int rank, int p, int root, NBC_Schedule *schedule,
                                    void *buffer, int count, MPI_Datatype datatype,
                                    int fragsize, int size)
{
  int res, vrank, rpeer, speer, numfrag, fragnum, fragcount, thiscount;
  long long bytes;
  MPI_Aint ext;
  char *buf;

  RANK2VRANK(rank, vrank, root);
  VRANK2RANK(rpeer, vrank-1, root);
  VRANK2RANK(speer, vrank+1, root);

  res = MPI_Type_extent(datatype, &ext);
  if (MPI_SUCCESS != res) {
    printf("MPI Error in MPI_Type_extent() (%i)\n", res);
    return res;
  }

  if (count == 0) {
    return NBC_OK;
  }

  /* total payload, computed in a wide type: count*size can exceed INT_MAX */
  bytes = (long long) count * size;

  /* number of fragments, rounded up */
  numfrag = 0;
  if (fragsize > 0) {
    numfrag = (int) (bytes / fragsize);
    if ((bytes % fragsize) != 0) {
      numfrag++;
    }
  }
  if (numfrag < 1) {
    /* zero-sized elements or a non-positive fragsize: use one fragment so
     * that the division below is well defined */
    numfrag = 1;
  }

  fragcount = count/numfrag;

  for (fragnum = 0; fragnum < numfrag; fragnum++) {
    buf = (char*)buffer + fragnum*fragcount*ext;
    thiscount = fragcount;
    if (fragnum == numfrag-1) {
      /* last fragment may not be full */
      thiscount = count - fragcount*fragnum;
    }

    /* root does not receive */
    if (vrank != 0) {
      res = NBC_Sched_recv(buf, false, thiscount, datatype, rpeer, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_recv() (%i)\n", res);
        return res;
      }

      /* the fragment has to arrive before it is forwarded */
      res = NBC_Sched_barrier(schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_barrier() (%i)\n", res);
        return res;
      }
    }

    /* last rank does not send */
    if (vrank != p-1) {
      res = NBC_Sched_send(buf, false, thiscount, datatype, speer, schedule);
      if (NBC_OK != res) {
        printf("Error in NBC_Sched_send() (%i)\n", res);
        return res;
      }

      /* this barrier here seems awkward but isn't!!!! */
      if (vrank == 0) {
        res = NBC_Sched_barrier(schedule);
        if (NBC_OK != res) {
          printf("Error in NBC_Sched_barrier() (%i)\n", res);
          return res;
        }
      }
    }
  }

  return NBC_OK;
}
/*
 * Linear Ialltoall schedule: for every peer other than ourselves, post one
 * receive into that peer's slot of recvbuf and one send from that peer's
 * slot of sendbuf.
 *
 *   rank, p        - caller's rank and number of peers to iterate over
 *   sndext, rcvext - extents of sendtype / recvtype, used for slot offsets
 *   schedule       - schedule the operations are appended to
 *   sendbuf/recvbuf, sendcount/recvcount, sendtype/recvtype
 *                  - per-peer block description; block r starts at
 *                    r * count * extent
 *   comm           - not used by this function
 *
 * Returns NBC_OK on success, otherwise the error code of the failing
 * NBC_Sched_recv / NBC_Sched_send call.
 */
static inline int a2a_sched_linear(int rank, int p, MPI_Aint sndext, MPI_Aint rcvext, NBC_Schedule* schedule, void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype, MPI_Comm comm) {
  int res, r;
  char *rbuf, *sbuf;

  res = NBC_OK;
  for (r = 0; r < p; r++) {
    /* easy algorithm: the local block is not exchanged here */
    if (r == rank) {
      continue;
    }

    /* widen to MPI_Aint first: r*count can exceed INT_MAX for large buffers */
    rbuf = ((char *) recvbuf) + ((MPI_Aint) r * recvcount * rcvext);
    res = NBC_Sched_recv(rbuf, false, recvcount, recvtype, r, schedule);
    if (NBC_OK != res) { printf("Error in NBC_Sched_recv() (%i)\n", res); return res; }

    sbuf = ((char *) sendbuf) + ((MPI_Aint) r * sendcount * sndext);
    res = NBC_Sched_send(sbuf, false, sendcount, sendtype, r, schedule);
    if (NBC_OK != res) { printf("Error in NBC_Sched_send() (%i)\n", res); return res; }
  }

  return res;
}
/*
 * simple linear MPI_Ibcast
 *
 * The root schedules one send of the whole buffer to every other rank;
 * every non-root rank schedules a single receive from the root.
 * Returns NBC_OK, or the error code of the first scheduling call that fails.
 */
static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) {
  int dest, status;

  if (rank != root) {
    /* non-root: a single receive from the root */
    status = NBC_Sched_recv(buffer, false, count, datatype, root, schedule);
    if (status != NBC_OK) {
      printf("Error in NBC_Sched_recv() (%i)\n", status);
      return status;
    }
    return NBC_OK;
  }

  /* root: one send per remaining rank */
  for (dest = 0; dest < p; dest++) {
    if (dest == root) {
      continue;
    }
    status = NBC_Sched_send(buffer, false, count, datatype, dest, schedule);
    if (status != NBC_OK) {
      printf("Error in NBC_Sched_send() (%i)\n", status);
      return status;
    }
  }

  return NBC_OK;
}
/*
 * Binomial-tree Ibcast schedule.
 *
 * Works on the virtual rank produced by RANK2VRANK. A non-root rank first
 * schedules the receive from its parent (the rank obtained by subtracting
 * the power of two 2^r with 2^r <= vrank < 2^(r+1)), followed by a barrier,
 * and then schedules sends to its children vrank + 2^r.
 * Returns OMPI_SUCCESS, or the error code of the first failing call.
 */
static inline int bcast_sched_binomial(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) {
  int vrank, peer, res;
  const int rounds = (int)ceil((log((double)p)/LOG2));

  RANK2VRANK(rank, vrank, root);

  /* receive from the right hosts */
  if (0 != vrank) {
    int r = 0;
    while (r < rounds) {
      const int step = 1 << r;
      if (vrank >= step && vrank < (1 << (r + 1))) {
        VRANK2RANK(peer, vrank - step, root);
        res = NBC_Sched_recv (buffer, false, count, datatype, peer, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
      }
      ++r;
    }

    /* the data has to be here before it can be passed on */
    res = NBC_Sched_barrier (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  /* now send to the right hosts */
  for (int r = 0 ; r < rounds ; ++r) {
    const int step = 1 << r;
    const int is_sender = (vrank == 0) || ((vrank + step < p) && (vrank < step));

    if (!is_sender) {
      continue;
    }

    VRANK2RANK(peer, vrank + step, root);
    res = NBC_Sched_send (buffer, false, count, datatype, peer, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  return OMPI_SUCCESS;
}
/*
 * simple linear MPI_Ibcast
 *
 * Non-root ranks schedule one receive from the root; the root schedules one
 * send of the full buffer to each of the other ranks.
 * Returns OMPI_SUCCESS, or the error code of the first scheduling call that
 * fails.
 */
static inline int bcast_sched_linear(int rank, int p, int root, NBC_Schedule *schedule, void *buffer, int count, MPI_Datatype datatype) {
  int rc;

  if (rank != root) {
    /* recv msg from root */
    rc = NBC_Sched_recv (buffer, false, count, datatype, root, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
      return rc;
    }
    return OMPI_SUCCESS;
  }

  /* send to all others */
  int dest = 0;
  while (dest < p) {
    if (dest != root) {
      rc = NBC_Sched_send (buffer, false, count, datatype, dest, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
      }
    }
    ++dest;
  }

  return OMPI_SUCCESS;
}
/*
 * allgather_sched_recursivedoubling
 *
 * Description: Iallgather schedule using the recursive doubling algorithm.
 * Limitation:  power-of-two number of processes only
 * Time:        O(log(comm_size))
 * Schedule length (rounds): O(log(comm_size))
 * Memory:      no additional memory beyond the user-supplied buffers.
 *
 * Every rank starts with its own contribution at block `rank` of rbuf.
 * In the round with distance d (d = 1, 2, 4, ...) a rank exchanges the d
 * contiguous blocks it currently holds with the partner (rank ^ d), so the
 * amount of gathered data doubles each round.
 *
 * Example on 4 nodes (columns are ranks, rows are blocks of rbuf):
 *   Initialization:
 *    #     0      1      2      3
 *         [0]    [ ]    [ ]    [ ]
 *         [ ]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [ ]
 *         [ ]    [ ]    [ ]    [3]
 *   Step 0: exchange data with (rank ^ 2^0)
 *    #     0      1      2      3
 *         [0]    [0]    [ ]    [ ]
 *         [1]    [1]    [ ]    [ ]
 *         [ ]    [ ]    [2]    [2]
 *         [ ]    [ ]    [3]    [3]
 *   Step 1: exchange data with (rank ^ 2^1) (if you can)
 *    #     0      1      2      3
 *         [0]    [0]    [0]    [0]
 *         [1]    [1]    [1]    [1]
 *         [2]    [2]    [2]    [2]
 *         [3]    [3]    [3]    [3]
 *
 * sbuf, scount and sdtype are accepted but not read here; all offsets are
 * computed on rbuf with rcount and the extent of rdtype.
 * Returns OMPI_SUCCESS or the error code of the first failing call.
 */
static inline int allgather_sched_recursivedoubling(
    int rank, int comm_size, NBC_Schedule *schedule, const void *sbuf,
    int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount,
    struct ompi_datatype_t *rdtype)
{
    ptrdiff_t lb, extent;
    int res = ompi_datatype_get_extent(rdtype, &lb, &extent);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        return res;
    }

    /* index of the first block of the contiguous range this rank holds */
    int first_block = rank;
    int distance = 1;

    while (distance < comm_size) {
        const int remote = rank ^ distance;
        /* the partner's range sits right above ours if we are the lower
         * rank of the pair, right below otherwise */
        const int recv_block = (rank < remote) ? (first_block + distance)
                                               : (first_block - distance);
        char *send_ptr = (char *)rbuf + (ptrdiff_t)first_block * (ptrdiff_t)rcount * extent;
        char *recv_ptr = (char *)rbuf + (ptrdiff_t)recv_block * (ptrdiff_t)rcount * extent;
        const ptrdiff_t nelems = (ptrdiff_t)distance * (ptrdiff_t)rcount;

        if (rank >= remote) {
            /* our range now starts at the partner's first block */
            first_block = recv_block;
        }

        res = NBC_Sched_send(send_ptr, false, nelems, rdtype, remote, schedule, false);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            return res;
        }

        res = NBC_Sched_recv(recv_ptr, false, nelems, rdtype, remote, schedule, true);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            return res;
        }

        distance <<= 1;
    }

    return res;
}
/*
 * linear iscan
 *
 * working principle:
 * 1. each node (but node 0) receives from its left neighbor
 * 2. performs op
 * 3. all but rank p-1 send to their right neighbor and exit
 *
 * Unless MPI_IN_PLACE is used, sendbuf is first copied into recvbuf; the
 * schedule then works on recvbuf and, for rank != 0, on a temporary buffer
 * stored in handle->tmpbuf that receives the partial result of rank-1.
 *
 * On success *request is set to the started handle and OMPI_SUCCESS is
 * returned. On failure the handle (if already created) is given back with
 * NBC_Return_handle and the error code is returned.
 */
int ompi_coll_libnbc_iscan(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op,
                           struct ompi_communicator_t *comm, ompi_request_t ** request,
                           struct mca_coll_base_module_2_1_0_t *module) {
  int rank, p, res;
  ptrdiff_t gap = 0, span;
  NBC_Schedule *schedule;
  char inplace;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  if (!inplace) {
    /* copy data to receivebuf */
    res = NBC_Copy (sendbuf, count, datatype, recvbuf, count, datatype, comm);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  res = NBC_Init_handle(comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    return res;
  }

  /* The schedule addresses the receive side relative to handle->tmpbuf, so
   * the buffer has to exist for every handle - also when the schedule itself
   * is taken from the cache below and not rebuilt. */
  if (rank != 0) {
    span = opal_datatype_span(&datatype->super, count, &gap);
    handle->tmpbuf = malloc (span);
    if (NULL == handle->tmpbuf) {
      NBC_Return_handle (handle);
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
  }

#ifdef NBC_CACHE_SCHEDULE
  NBC_Scan_args *args, *found, search;

  /* search schedule in communicator specific tree */
  search.sendbuf = sendbuf;
  search.recvbuf = recvbuf;
  search.count = count;
  search.datatype = datatype;
  search.op = op;
  found = (NBC_Scan_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      NBC_Return_handle (handle);
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* ensure the schedule is released with the handle */
    handle->schedule = schedule;

    if (rank != 0) {
      /* we have to wait until we have the data */
      res = NBC_Sched_recv ((void *)(-gap), true, count, datatype, rank-1, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }

      /* perform the reduce in my local buffer */
      /* this cannot be done until handle->tmpbuf is unused :-( so barrier after the op */
      res = NBC_Sched_op ((void *)(-gap), true, recvbuf, false, count, datatype, op, schedule,
                          true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }
    }

    if (rank != p-1) {
      res = NBC_Sched_send (recvbuf, false, count, datatype, rank+1, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        NBC_Return_handle (handle);
        return res;
      }
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      NBC_Return_handle (handle);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree; the entry must have room for the whole
     * argument record, not just for a pointer to it */
    args = (NBC_Scan_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->sendbuf = sendbuf;
      args->recvbuf = recvbuf;
      args->count = count;
      args->datatype = datatype;
      args->op = op;
      args->schedule = schedule;
      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN], args, args, 0);
      if (0 == res) {
        /* the cache holds its own reference to the schedule */
        OBJ_RETAIN(schedule);

        /* increase number of cached scan schedules */
        if (++libnbc_module->NBC_Dict_size[NBC_SCAN] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_SCAN],
                                   &libnbc_module->NBC_Dict_size[NBC_SCAN]);
        }
      } else {
        NBC_Error("error in dict_insert() (%i)", res);
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Start(handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  /* tmpbuf is freed with the handle */
  return OMPI_SUCCESS;
}
/*
 * Non-blocking gather on an inter-communicator.
 *
 * The role of the caller is selected by `root`:
 *   - MPI_ROOT:      schedules one receive per remote rank into consecutive
 *                    blocks of recvbuf (block i starts at i*recvcount*extent)
 *   - MPI_PROC_NULL: schedules nothing
 *   - anything else: schedules a single send of sendbuf to rank `root`
 *
 * On success *request is set to the started handle and OMPI_SUCCESS is
 * returned; otherwise the schedule (or the handle once NBC_Start has been
 * attempted) is released and the error code is returned.
 */
int ompi_coll_libnbc_igather_inter (const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
                                    MPI_Datatype recvtype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
                                    struct mca_coll_base_module_2_1_0_t *module) {
  int res, rsize;
  MPI_Aint rcvext = 0;
  NBC_Schedule *schedule;
  char *rbuf;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  rsize = ompi_comm_remote_size (comm);

  /* only the receiving root needs the extent of the receive type */
  if (root == MPI_ROOT) {
    res = ompi_datatype_type_extent(recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
      NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
      return res;
    }
  }

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* send to root */
  if (root != MPI_ROOT && root != MPI_PROC_NULL) {
    /* send msg to root */
    res = NBC_Sched_send (sendbuf, false, sendcount, sendtype, root, schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }
  } else if (MPI_ROOT == root) {
    for (int i = 0 ; i < rsize ; ++i) {
      /* widen before multiplying: i*recvcount can exceed INT_MAX */
      rbuf = ((char *)recvbuf) + ((MPI_Aint) i * recvcount * rcvext);
      /* root receives message to the right buffer */
      res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, i, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    }
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Init_handle (comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Start (handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}
/*
 * Non-blocking gather on an intra-communicator.
 *
 * Non-root ranks schedule a single send of sendbuf to `root`. The root
 * copies its own contribution into block `rank` of recvbuf right away
 * (skipped for MPI_IN_PLACE) and schedules one receive per other rank into
 * block i of recvbuf, where block i starts at i*recvcount*extent(recvtype).
 *
 * With NBC_CACHE_SCHEDULE the committed schedule is looked up in / stored
 * to the per-communicator dictionary, keyed by the call arguments.
 *
 * On success *request is set to the started handle and OMPI_SUCCESS is
 * returned; otherwise the error code of the failing step is returned.
 */
int ompi_coll_libnbc_igather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount,
                             MPI_Datatype recvtype, int root, struct ompi_communicator_t *comm, ompi_request_t ** request,
                             struct mca_coll_base_module_2_1_0_t *module) {
  int rank, p, res;
  MPI_Aint rcvext = 0;
  NBC_Schedule *schedule;
  char *rbuf, inplace = 0;
  NBC_Handle *handle;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  rank = ompi_comm_rank (comm);
  /* the in-place check is only applied on the root */
  if (root == rank) {
    NBC_IN_PLACE(sendbuf, recvbuf, inplace);
  }
  p = ompi_comm_size (comm);

  if (rank == root) {
    res = ompi_datatype_type_extent (recvtype, &rcvext);
    if (MPI_SUCCESS != res) {
      NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
      return res;
    }
  }

  if (inplace) {
    sendcount = recvcount;
    sendtype = recvtype;
  } else if (rank == root) {
    /* widen before multiplying: rank*recvcount can exceed INT_MAX */
    rbuf = ((char *)recvbuf) + ((MPI_Aint) rank * recvcount * rcvext);
    /* if I am the root - just copy the message (only without MPI_IN_PLACE) */
    res = NBC_Copy(sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

#ifdef NBC_CACHE_SCHEDULE
  NBC_Gather_args *args, *found, search;

  /* search schedule in communicator specific tree */
  search.sendbuf = sendbuf;
  search.sendcount = sendcount;
  search.sendtype = sendtype;
  search.recvbuf = recvbuf;
  search.recvcount = recvcount;
  search.recvtype = recvtype;
  search.root = root;
  found = (NBC_Gather_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_GATHER],
                                              &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* send to root */
    if (rank != root) {
      /* send msg to root */
      res = NBC_Sched_send(sendbuf, false, sendcount, sendtype, root, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    } else {
      for (int i = 0 ; i < p ; ++i) {
        rbuf = (char *)recvbuf + (MPI_Aint) i * recvcount * rcvext;
        if (i != root) {
          /* root receives message to the right buffer */
          res = NBC_Sched_recv (rbuf, false, recvcount, recvtype, i, schedule, false);
          if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
            OBJ_RELEASE(schedule);
            return res;
          }
        }
      }
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree; the entry must have room for the whole
     * argument record, not just for a pointer to it */
    args = (NBC_Gather_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->sendbuf = sendbuf;
      args->sendcount = sendcount;
      args->sendtype = sendtype;
      args->recvbuf = recvbuf;
      args->recvcount = recvcount;
      args->recvtype = recvtype;
      args->root = root;
      args->schedule = schedule;
      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_GATHER], args, args, 0);
      if (0 == res) {
        /* the cache holds its own reference to the schedule */
        OBJ_RETAIN(schedule);

        /* increase number of cached gather schedules */
        if (++libnbc_module->NBC_Dict_size[NBC_GATHER] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_GATHER],
                                   &libnbc_module->NBC_Dict_size[NBC_GATHER]);
        }
      } else {
        NBC_Error("error in dict_insert() (%i)", res);
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Init_handle (comm, &handle, libnbc_module);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  res = NBC_Start (handle, schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    NBC_Return_handle (handle);
    return res;
  }

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}
/*
 * Non-blocking reduce_scatter: a binomial-tree reduce of the whole vector
 * to rank 0, followed by a linear scatter in which rank 0 sends
 * recvcounts[r] elements to every rank r and copies its own part into
 * recvbuf.
 *
 * handle->tmpbuf holds 2*count elements (count = sum of recvcounts): the
 * first half is the landing area for data received from a peer, the second
 * half (redbuf) accumulates the reduced result. Buffers inside tmpbuf are
 * passed to the schedule as offsets relative to handle->tmpbuf.
 *
 * Returns NBC_OK on success, NBC_OOR if an allocation fails, otherwise the
 * error code of the failing call.
 */
int ompi_coll_libnbc_ireduce_scatter(void* sendbuf, void* recvbuf, int *recvcounts, MPI_Datatype datatype, MPI_Op op, struct ompi_communicator_t *comm, ompi_request_t ** request, struct mca_coll_base_module_2_0_0_t *module) {
  int peer, rank, maxr, p, r, res, count, offset, firstred;
  MPI_Aint ext;
  char *redbuf, *sbuf, inplace;
  NBC_Schedule *schedule;
  NBC_Handle *handle;
  ompi_coll_libnbc_request_t **coll_req = (ompi_coll_libnbc_request_t**) request;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  res = NBC_Init_handle(comm, coll_req, libnbc_module);
  if(res != NBC_OK) { printf("Error in NBC_Init_handle(%i)\n", res); return res; }
  handle = (*coll_req);
  res = MPI_Comm_rank(comm, &rank);
  if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_rank() (%i)\n", res); return res; }
  res = MPI_Comm_size(comm, &p);
  if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Comm_size() (%i)\n", res); return res; }
  /* keep the result so that the check below tests this call and not the
   * previous one */
  res = MPI_Type_extent(datatype, &ext);
  if (MPI_SUCCESS != res) { printf("MPI Error in MPI_Type_extent() (%i)\n", res); return res; }

  /* NOTE(review): schedule is not released on the error returns below -
   * confirm which helper is responsible for freeing it */
  schedule = (NBC_Schedule*)malloc(sizeof(NBC_Schedule));
  if (NULL == schedule) { printf("Error in malloc()\n"); return NBC_OOR; }

  res = NBC_Sched_create(schedule);
  if(res != NBC_OK) { printf("Error in NBC_Sched_create (%i)\n", res); return res; }

  /* number of rounds of the binomial tree */
  maxr = (int)ceil((log(p)/LOG2));

  /* total number of elements that take part in the reduction */
  count = 0;
  for(r=0;r<p;r++) count += recvcounts[r];

  handle->tmpbuf = malloc(ext*count*2);
  if(handle->tmpbuf == NULL) { printf("Error in malloc()\n"); return NBC_OOR; }

  /* second half of tmpbuf accumulates the reduced data */
  redbuf = ((char*)handle->tmpbuf)+(ext*count);

  /* copy data to redbuf if we only have a single node */
  /* NOTE(review): for p == 1 with MPI_IN_PLACE redbuf is never filled but is
   * still copied to recvbuf by the NBC_Sched_copy below - confirm */
  if((p==1) && !inplace) {
    res = NBC_Copy(sendbuf, count, datatype, redbuf, count, datatype, comm);
    if (NBC_OK != res) { printf("Error in NBC_Copy() (%i)\n", res); return res; }
  }

  firstred = 1;
  for(r=1; r<=maxr; r++) {
    if((rank % (1<<r)) == 0) {
      /* we have to receive this round */
      peer = rank + (1<<(r-1));
      if(peer<p) {
        /* offset 0 relative to tmpbuf: the landing area */
        res = NBC_Sched_recv(0, true, count, datatype, peer, schedule);
        if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; }
        /* we have to wait until we have the data */
        res = NBC_Sched_barrier(schedule);
        if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; }
        if(firstred) {
          /* take reduce data from the sendbuf in the first round -> save copy */
          res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, sendbuf, false, 0, true, count, datatype, op, schedule);
          firstred = 0;
        } else {
          /* perform the reduce in my local buffer */
          res = NBC_Sched_op(redbuf-(unsigned long)handle->tmpbuf, true, redbuf-(unsigned long)handle->tmpbuf, true, 0, true, count, datatype, op, schedule);
        }
        if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_op() (%i)\n", res); return res; }
        /* this cannot be done until handle->tmpbuf is unused :-( */
        res = NBC_Sched_barrier(schedule);
        if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; }
      }
    } else {
      /* we have to send this round */
      peer = rank - (1<<(r-1));
      if(firstred) {
        /* we have to send the sendbuf */
        res = NBC_Sched_send(sendbuf, false, count, datatype, peer, schedule);
      } else {
        /* we send an already reduced value from redbuf */
        res = NBC_Sched_send(redbuf-(unsigned long)handle->tmpbuf, true, count, datatype, peer, schedule);
      }
      if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; }
      /* leave the game */
      break;
    }
  }

  /* separate the reduce phase from the scatter phase */
  res = NBC_Sched_barrier(schedule);
  if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_barrier() (%i)\n", res); return res; }

  /* rank 0 is root and sends - all others receive */
  if(rank != 0) {
    res = NBC_Sched_recv(recvbuf, false, recvcounts[rank], datatype, 0, schedule);
    if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_recv() (%i)\n", res); return res; }
  }

  if(rank == 0) {
    offset = 0;
    for(r=1;r<p;r++) {
      /* the part of rank r starts after the parts of ranks 0..r-1 */
      offset += recvcounts[r-1];
      sbuf = ((char *)redbuf) + (offset*ext);
      /* root sends the right buffer to the right receiver */
      res = NBC_Sched_send(sbuf-(unsigned long)handle->tmpbuf, true, recvcounts[r], datatype, r, schedule);
      if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_send() (%i)\n", res); return res; }
    }
    /* root keeps the first recvcounts[0] elements for itself */
    res = NBC_Sched_copy(redbuf-(unsigned long)handle->tmpbuf, true, recvcounts[0], datatype, recvbuf, false, recvcounts[0], datatype, schedule);
    if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_copy() (%i)\n", res); return res; }
  }

  /*NBC_PRINT_SCHED(*schedule);*/

  res = NBC_Sched_commit(schedule);
  if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Sched_commit() (%i)\n", res); return res; }

  res = NBC_Start(handle, schedule);
  if (NBC_OK != res) { free(handle->tmpbuf); printf("Error in NBC_Start() (%i)\n", res); return res; }

  /* tmpbuf is freed with the handle */
  return NBC_OK;
}