/*
 * Build the schedule for a nonblocking / persistent broadcast and wrap it in
 * a request.
 *
 * buffer, count, datatype, root  - the usual broadcast arguments
 * comm                           - communicator the broadcast runs on
 * request                        - receives the created request
 * module                         - libnbc module (cast to ompi_coll_libnbc_module_t)
 * persistent                     - forwarded to the request creation helpers
 *
 * Returns OMPI_SUCCESS, or the error code of the first failing step.  A
 * communicator of size 1 short-circuits to a no-op request.
 *
 * When NBC_CACHE_SCHEDULE is defined, schedules are looked up in / stored to
 * the per-module dictionary keyed by (buffer, count, datatype, root).
 */
static int nbc_bcast_init(void *buffer, int count, MPI_Datatype datatype, int root,
                          struct ompi_communicator_t *comm, ompi_request_t ** request,
                          struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
  int rank, p, res, segsize;
  size_t size;
  NBC_Schedule *schedule;
#ifdef NBC_CACHE_SCHEDULE
  NBC_Bcast_args *args, *found, search;
#endif
  enum { NBC_BCAST_LINEAR, NBC_BCAST_BINOMIAL, NBC_BCAST_CHAIN } alg;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  /* nothing to communicate with a single process */
  if (1 == p) {
    return nbc_get_noop_request(persistent, request);
  }

  res = ompi_datatype_type_size(datatype, &size);
  if (MPI_SUCCESS != res) {
    NBC_Error("MPI Error in ompi_datatype_type_size() (%i)", res);
    return res;
  }

  /* default segment size; only consumed by the chain algorithm */
  segsize = 16384;

  /* algorithm selection */
  if (libnbc_ibcast_skip_dt_decision) {
    /* decide on communicator size only, ignoring the message size */
    if (p <= 4) {
      alg = NBC_BCAST_LINEAR;
    } else {
      alg = NBC_BCAST_BINOMIAL;
    }
  } else {
    /* decide on communicator size and total message size in bytes */
    if (p <= 4) {
      alg = NBC_BCAST_LINEAR;
    } else if (size * count < 65536) {
      alg = NBC_BCAST_BINOMIAL;
    } else if (size * count < 524288) {
      alg = NBC_BCAST_CHAIN;
      segsize = 8192;
    } else {
      alg = NBC_BCAST_CHAIN;
      segsize = 32768;
    }
  }

#ifdef NBC_CACHE_SCHEDULE
  /* search schedule in communicator specific tree */
  search.buffer = buffer;
  search.count = count;
  search.datatype = datatype;
  search.root = root;
  found = (NBC_Bcast_args *) hb_tree_search ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST], &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    switch(alg) {
      case NBC_BCAST_LINEAR:
        res = bcast_sched_linear(rank, p, root, schedule, buffer, count, datatype);
        break;
      case NBC_BCAST_BINOMIAL:
        res = bcast_sched_binomial(rank, p, root, schedule, buffer, count, datatype);
        break;
      case NBC_BCAST_CHAIN:
        res = bcast_sched_chain(rank, p, root, schedule, buffer, count, datatype, segsize, size);
        break;
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

    res = NBC_Sched_commit (schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree */
    /* allocate the full argument record: sizeof (*args), not the pointer size */
    args = (NBC_Bcast_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->buffer = buffer;
      args->count = count;
      args->datatype = datatype;
      args->root = root;
      args->schedule = schedule;
      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST], args, args, 0);
      if (0 == res) {
        /* the dictionary now holds its own reference to the schedule */
        OBJ_RETAIN (schedule);

        /* increase number of cached Bcast schedules; wipe the cache when it grows too large */
        if (++libnbc_module->NBC_Dict_size[NBC_BCAST] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_BCAST],
                                   &libnbc_module->NBC_Dict_size[NBC_BCAST]);
        }
      } else {
        /* caching is best effort: report, drop the record, keep going */
        NBC_Error("error in dict_insert() (%i)", res);
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  return OMPI_SUCCESS;
}
/*
 * Build the schedule for a nonblocking / persistent allgather and wrap it in
 * a request.
 *
 * sendbuf/sendcount/sendtype  - local contribution (replaced by the receive
 *                               count/type when the operation is in place)
 * recvbuf/recvcount/recvtype  - receive buffer; rank r's block starts at
 *                               r * recvcount * extent(recvtype)
 * comm                        - communicator the allgather runs on
 * request                     - receives the created request
 * module                      - libnbc module (cast to ompi_coll_libnbc_module_t)
 * persistent                  - if true the local copy is put into the
 *                               schedule instead of being performed here
 *
 * Returns OMPI_SUCCESS, or the error code of the first failing step.
 *
 * When NBC_CACHE_SCHEDULE is defined, schedules are looked up in / stored to
 * the per-module dictionary keyed by the six buffer/count/type arguments.
 */
static int nbc_allgather_init(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf,
                              int recvcount, MPI_Datatype recvtype, struct ompi_communicator_t *comm,
                              ompi_request_t ** request, struct mca_coll_base_module_2_3_0_t *module,
                              bool persistent)
{
  int rank, p, res;
  MPI_Aint rcvext;
  NBC_Schedule *schedule;
  char *rbuf, inplace;
#ifdef NBC_CACHE_SCHEDULE
  NBC_Allgather_args *args, *found, search;
#endif
  enum { NBC_ALLGATHER_LINEAR, NBC_ALLGATHER_RDBL} alg;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);
  int is_commsize_pow2 = !(p & (p - 1));

  /* Recursive doubling is used only when explicitly requested (value 2) and
   * the communicator size is a power of two; every other setting is linear. */
  if (2 == libnbc_iallgather_algorithm && is_commsize_pow2) {
    alg = NBC_ALLGATHER_RDBL;
  } else {
    alg = NBC_ALLGATHER_LINEAR;
  }

  res = ompi_datatype_type_extent(recvtype, &rcvext);
  if (MPI_SUCCESS != res) {
    return res;
  }

  if (inplace) {
    sendtype = recvtype;
    sendcount = recvcount;
  } else if (!persistent) { /* for persistent, the copy must be scheduled */
    /* copy my data to receive buffer; widen before multiplying so that
     * rank * recvcount cannot overflow int */
    rbuf = (char *) recvbuf + (MPI_Aint) rank * recvcount * rcvext;
    res = NBC_Copy (sendbuf, sendcount, sendtype, rbuf, recvcount, recvtype, comm);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      return res;
    }
  }

  /* single process: nothing left to do unless a copy still has to be scheduled */
  if (1 == p && (!persistent || inplace)) {
    return nbc_get_noop_request(persistent, request);
  }

#ifdef NBC_CACHE_SCHEDULE
  /* search schedule in communicator specific tree */
  search.sendbuf = sendbuf;
  search.sendcount = sendcount;
  search.sendtype = sendtype;
  search.recvbuf = recvbuf;
  search.recvcount = recvcount;
  search.recvtype = recvtype;
  found = (NBC_Allgather_args *) hb_tree_search ((hb_tree*)libnbc_module->NBC_Dict[NBC_ALLGATHER], &search);
  if (NULL == found) {
#endif
    schedule = OBJ_NEW(NBC_Schedule);
    if (OPAL_UNLIKELY(NULL == schedule)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (persistent && !inplace) { /* for nonblocking, data has been copied already */
      /* copy my data to receive buffer (= send buffer of NBC_Sched_send) */
      rbuf = (char *) recvbuf + (MPI_Aint) rank * recvcount * rcvext;
      res = NBC_Sched_copy((void *)sendbuf, false, sendcount, sendtype,
                           rbuf, false, recvcount, recvtype, schedule, true);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        OBJ_RELEASE(schedule);
        return res;
      }
    }

    switch (alg) {
      case NBC_ALLGATHER_LINEAR:
        res = allgather_sched_linear(rank, p, schedule, sendbuf, sendcount, sendtype,
                                     recvbuf, recvcount, recvtype);
        break;
      case NBC_ALLGATHER_RDBL:
        res = allgather_sched_recursivedoubling(rank, p, schedule, sendbuf, sendcount,
                                                sendtype, recvbuf, recvcount, recvtype);
        break;
    }

    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

    res = NBC_Sched_commit(schedule);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      OBJ_RELEASE(schedule);
      return res;
    }

#ifdef NBC_CACHE_SCHEDULE
    /* save schedule to tree */
    /* allocate the full argument record: sizeof (*args), not the pointer size.
     * Caching is best effort, so an allocation failure just skips it. */
    args = (NBC_Allgather_args *) malloc (sizeof (*args));
    if (NULL != args) {
      args->sendbuf = sendbuf;
      args->sendcount = sendcount;
      args->sendtype = sendtype;
      args->recvbuf = recvbuf;
      args->recvcount = recvcount;
      args->recvtype = recvtype;
      args->schedule = schedule;
      res = hb_tree_insert ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLGATHER], args, args, 0);
      if (0 == res) {
        /* the dictionary now holds its own reference to the schedule */
        OBJ_RETAIN(schedule);

        /* count only entries that were really inserted; wipe the cache
         * when it grows too large */
        if (++libnbc_module->NBC_Dict_size[NBC_ALLGATHER] > NBC_SCHED_DICT_UPPER) {
          NBC_SchedCache_dictwipe ((hb_tree *) libnbc_module->NBC_Dict[NBC_ALLGATHER],
                                   &libnbc_module->NBC_Dict_size[NBC_ALLGATHER]);
        }
      } else {
        free (args);
      }
    }
  } else {
    /* found schedule */
    schedule = found->schedule;
    OBJ_RETAIN(schedule);
  }
#endif

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, NULL);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    return res;
  }

  return OMPI_SUCCESS;
}
/*
 * Simple linear Alltoallv.
 *
 * Creates the schedule for a nonblocking / persistent alltoallv and hands it,
 * together with an optional temporary buffer, to NBC_Schedule_request().
 * Displacements are counted in units of the respective datatype extent.
 *
 * In-place operation: a scratch buffer large enough for the biggest receive
 * count is allocated and the receive counts/displacements double as send
 * counts/displacements.  If that span is zero a no-op request is returned.
 *
 * Returns OMPI_SUCCESS or the error code of the first failing step; on
 * failure after allocation both the schedule and the scratch buffer are
 * released here.
 */
static int nbc_alltoallv_init(const void* sendbuf, const int *sendcounts, const int *sdispls,
                              MPI_Datatype sendtype, void* recvbuf, const int *recvcounts, const int *rdispls,
                              MPI_Datatype recvtype, struct ompi_communicator_t *comm, ompi_request_t ** request,
                              struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;
  NBC_Schedule *schedule;
  MPI_Aint sndext, rcvext;
  ptrdiff_t gap = 0, span;
  void *tmpbuf = NULL;
  char inplace;
  int rank, p, res;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  res = ompi_datatype_type_extent (recvtype, &rcvext);
  if (MPI_SUCCESS != res) {
    NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
    return res;
  }

  if (!inplace) {
    /* the send extent is only needed when a separate send buffer exists */
    res = ompi_datatype_type_extent (sendtype, &sndext);
    if (MPI_SUCCESS != res) {
      NBC_Error("MPI Error in ompi_datatype_type_extent() (%i)", res);
      return res;
    }
  } else {
    /* size the scratch buffer for the largest block that will be received */
    int maxcount = 0;
    for (int peer = 0; peer < p; ++peer) {
      maxcount = (recvcounts[peer] > maxcount) ? recvcounts[peer] : maxcount;
    }

    span = opal_datatype_span(&recvtype->super, maxcount, &gap);
    if (OPAL_UNLIKELY(0 == span)) {
      return nbc_get_noop_request(persistent, request);
    }

    tmpbuf = malloc(span);
    if (OPAL_UNLIKELY(NULL == tmpbuf)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    sendcounts = recvcounts;
    sdispls = rdispls;
  }

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    free(tmpbuf);
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  if (inplace) {
    res = a2av_sched_inplace(rank, p, schedule, recvbuf, recvcounts,
                             rdispls, rcvext, recvtype, gap);
  } else {
    /* the block a rank sends to itself is a plain local copy */
    if (0 != sendcounts[rank]) {
      char *self_dst = (char *) recvbuf + rdispls[rank] * rcvext;
      char *self_src = (char *) sendbuf + sdispls[rank] * sndext;

      res = NBC_Sched_copy (self_src, false, sendcounts[rank], sendtype,
                            self_dst, false, recvcounts[rank], recvtype, schedule, false);
      if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
        goto fail;
      }
    }

    res = a2av_sched_linear(rank, p, schedule,
                            sendbuf, sendcounts, sdispls, sndext, sendtype,
                            recvbuf, recvcounts, rdispls, rcvext, recvtype);
  }
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto fail;
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto fail;
  }

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    goto fail;
  }

  return OMPI_SUCCESS;

fail:
  /* tmpbuf is NULL unless the operation is in place, so free() is always safe */
  OBJ_RELEASE(schedule);
  free(tmpbuf);
  return res;
}
/*
 * Turn a committed schedule into a request.
 *
 * schedule    - the schedule to attach to the request
 * comm        - communicator the collective runs on
 * module      - libnbc module; supplies the tag counter, its mutex and the
 *               comm_registered flag
 * persistent  - passed to the request allocation; also the initial value of
 *               the request's nbc_complete flag
 * request     - receives the new request
 * tmpbuf      - optional scratch buffer to attach to the request (may be NULL)
 *
 * If the schedule is empty, a no-op request is returned instead; in that case
 * the schedule is released and tmpbuf is freed here.  Otherwise both are
 * stored in the new request handle.  On failure
 * (OMPI_ERR_OUT_OF_RESOURCE) neither schedule nor tmpbuf is touched.
 *
 * A tag is consumed from the module in either case.
 */
int NBC_Schedule_request(NBC_Schedule *schedule, ompi_communicator_t *comm,
                         ompi_coll_libnbc_module_t *module, bool persistent,
                         ompi_request_t **request, void *tmpbuf)
{
  ompi_coll_libnbc_request_t *handle;
  bool first_use_of_module = false;
  int tag;

  /* no operation (e.g. one process barrier)?  The leading int and the byte
   * that follows it are both zero in that case. */
  const bool is_noop = (0 == ((int *)schedule->data)[0]) &&
                       (0 == schedule->data[sizeof(int)]);

  if (is_noop) {
    if (OMPI_SUCCESS != nbc_get_noop_request(persistent, request)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* update the module->tag here because other processes may have operations
     * and they may update the module->tag */
    OPAL_THREAD_LOCK(&module->mutex);
    tag = module->tag--;
    if (MCA_COLL_BASE_TAG_NONBLOCKING_END == tag) {
      tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE;
      NBC_DEBUG(2,"resetting tags ...\n");
    }
    OPAL_THREAD_UNLOCK(&module->mutex);

    /* the no-op request carries neither schedule nor scratch buffer */
    OBJ_RELEASE(schedule);
    free(tmpbuf);

    return OMPI_SUCCESS;
  }

  OMPI_COLL_LIBNBC_REQUEST_ALLOC(comm, persistent, handle);
  if (NULL == handle) {
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* fill in everything that does not depend on the tag */
  handle->req_count = 0;
  handle->req_array = NULL;
  handle->row_offset = 0;
  handle->nbc_complete = persistent ? true : false;
  handle->comm = comm;
  handle->comminfo = module;
  handle->tmpbuf = tmpbuf;
  handle->schedule = schedule;

  /* tag administration: take the next tag (wrapping around at the end of the
   * range) and note whether this module still has to be registered */
  OPAL_THREAD_LOCK(&module->mutex);
  tag = module->tag--;
  if (MCA_COLL_BASE_TAG_NONBLOCKING_END == tag) {
    tag = module->tag = MCA_COLL_BASE_TAG_NONBLOCKING_BASE;
    NBC_DEBUG(2,"resetting tags ...\n");
  }
  if (true != module->comm_registered) {
    module->comm_registered = true;
    first_use_of_module = true;
  }
  OPAL_THREAD_UNLOCK(&module->mutex);

  handle->tag = tag;

  /* register progress: only when the active_comms counter reaches 1 */
  if (first_use_of_module) {
    int32_t active = OPAL_THREAD_ADD_FETCH32(&mca_coll_libnbc_component.active_comms, 1);
    if (1 == active) {
      opal_progress_register(ompi_coll_libnbc_progress);
    }
  }

  NBC_DEBUG(3, "got tag %i\n", handle->tag);

  *request = (ompi_request_t *) handle;

  return OMPI_SUCCESS;
}
/*
 * Simple linear Alltoallw.
 *
 * Creates the schedule for a nonblocking / persistent alltoallw and hands it,
 * together with an optional temporary buffer, to NBC_Schedule_request().
 * Displacements are applied to the buffers as byte offsets.
 *
 * In-place operation: a scratch buffer as large as the biggest per-peer
 * receive span is allocated and the receive counts/displacements/types double
 * as the send ones.  If that span is zero a no-op request is returned.
 *
 * Returns OMPI_SUCCESS or the error code of the first failing step; on every
 * failure after allocation both the schedule and the scratch buffer are
 * released here.
 */
static int nbc_alltoallw_init(const void* sendbuf, const int *sendcounts, const int *sdispls,
                              struct ompi_datatype_t * const *sendtypes, void* recvbuf, const int *recvcounts, const int *rdispls,
                              struct ompi_datatype_t * const *recvtypes, struct ompi_communicator_t *comm, ompi_request_t ** request,
                              struct mca_coll_base_module_2_3_0_t *module, bool persistent)
{
  int rank, p, res;
  NBC_Schedule *schedule;
  char *rbuf, *sbuf, inplace;
  ptrdiff_t span=0;
  void *tmpbuf = NULL;
  ompi_coll_libnbc_module_t *libnbc_module = (ompi_coll_libnbc_module_t*) module;

  NBC_IN_PLACE(sendbuf, recvbuf, inplace);

  rank = ompi_comm_rank (comm);
  p = ompi_comm_size (comm);

  /* copy data to receivbuffer */
  if (inplace) {
    ptrdiff_t lgap, lspan;

    /* the scratch buffer must hold the largest block received from any peer */
    for (int i = 0; i < p; i++) {
      lspan = opal_datatype_span(&recvtypes[i]->super, recvcounts[i], &lgap);
      if (lspan > span) {
        span = lspan;
      }
    }
    if (OPAL_UNLIKELY(0 == span)) {
      return nbc_get_noop_request(persistent, request);
    }
    tmpbuf = malloc(span);
    if (OPAL_UNLIKELY(NULL == tmpbuf)) {
      return OMPI_ERR_OUT_OF_RESOURCE;
    }
    sendcounts = recvcounts;
    sdispls = rdispls;
    sendtypes = recvtypes;
  }

  schedule = OBJ_NEW(NBC_Schedule);
  if (OPAL_UNLIKELY(NULL == schedule)) {
    free(tmpbuf);
    return OMPI_ERR_OUT_OF_RESOURCE;
  }

  /* the block a rank sends to itself is a plain local copy */
  if (!inplace && sendcounts[rank] != 0) {
    rbuf = (char *) recvbuf + rdispls[rank];
    sbuf = (char *) sendbuf + sdispls[rank];
    res = NBC_Sched_copy(sbuf, false, sendcounts[rank], sendtypes[rank],
                         rbuf, false, recvcounts[rank], recvtypes[rank], schedule, false);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
      /* do not leak the schedule (tmpbuf is NULL here, free() is a no-op) */
      OBJ_RELEASE(schedule);
      free(tmpbuf);
      return res;
    }
  }

  if (inplace) {
    res = a2aw_sched_inplace(rank, p, schedule, recvbuf,
                             recvcounts, rdispls, recvtypes);
  } else {
    res = a2aw_sched_linear(rank, p, schedule,
                            sendbuf, sendcounts, sdispls, sendtypes,
                            recvbuf, recvcounts, rdispls, recvtypes);
  }
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    free(tmpbuf);
    return res;
  }

  res = NBC_Sched_commit (schedule);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    free(tmpbuf);
    return res;
  }

  res = NBC_Schedule_request(schedule, comm, libnbc_module, persistent, request, tmpbuf);
  if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
    OBJ_RELEASE(schedule);
    free(tmpbuf);
    return res;
  }

  return OMPI_SUCCESS;
}