int down_search(int rank, int parent, int me, int num_procs,
                int *num_children, opal_list_t *children,
                opal_bitmap_t *relatives)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);
        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                if (NULL != children) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(children, &child->super);
                    (*num_children)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);
                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone else's relatives - set the
                     * bit, and release the child object so it doesn't leak */
                    opal_bitmap_set_bit(relatives, peer);
                    OBJ_RELEASE(child);
                    /* point at the shared relatives bitmap */
                    relations = relatives;
                }
                /* search for this child's relatives */
                down_search(0, 0, peer, num_procs, NULL, NULL, relations);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    bitmap = opal_cube_dim(num_procs);
    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < num_procs) {
            /* recurse down this subtree */
            if (0 <= (found = down_search(peer, rank, me, num_procs,
                                          num_children, children, relatives))) {
                return found;
            }
        }
    }
    return -1;
}
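Every routine in this set leans on the same OPAL bit-twiddling idiom: opal_cube_dim(n) yields the hypercube dimension (the smallest d with 2^d >= n), opal_hibit(rank, d) yields the position of the highest set bit of rank, a rank's parent is obtained by clearing that bit, and its children by setting each higher bit in turn while staying below num_procs. The standalone sketch below demonstrates the idiom; cube_dim and hibit are local stand-ins assumed to mirror the OPAL helpers, and the program is illustrative rather than part of any function shown here.

#include <stdio.h>

/* Stand-in for opal_cube_dim(): smallest d with (1 << d) >= n (assumption). */
static int cube_dim(int n)
{
    int dim = 0;
    for (int size = 1; size < n; size <<= 1) {
        dim++;
    }
    return dim;
}

/* Stand-in for opal_hibit(): position of the highest set bit of value,
 * scanning down from bit position start; -1 when value is 0 (assumption). */
static int hibit(int value, int start)
{
    unsigned int mask = 1u << start;
    for (; start >= 0; --start, mask >>= 1) {
        if (value & mask) {
            break;
        }
    }
    return start;
}

int main(void)
{
    int num_procs = 6;
    int dim = cube_dim(num_procs);   /* 3, since 2^3 = 8 >= 6 */

    for (int rank = 0; rank < num_procs; ++rank) {
        int hi = hibit(rank, dim);

        /* parent: clear the highest set bit (rank 0 has no parent) */
        printf("rank %d parent %d children:", rank,
               rank > 0 ? (rank & ~(1 << hi)) : -1);

        /* children: set each bit above the highest one, as in the loops above */
        for (int i = hi + 1, mask = 1 << i; i <= dim - 1; ++i, mask <<= 1) {
            int peer = rank | mask;
            if (peer < num_procs) {
                printf(" %d", peer);
            }
        }
        printf("\n");
    }
    return 0;
}

Run on six ranks this prints the binomial tree 0 -> {1, 2, 4}, 1 -> {3, 5}, matching what down_search records.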
/*
 * bcast_log_intra
 *
 * Function:  - broadcast using O(log(N)) algorithm
 * Accepts:   - same arguments as MPI_Bcast()
 * Returns:   - MPI_SUCCESS or error code
 */
int mca_coll_basic_bcast_log_intra(void *buff, int count,
                                   struct ompi_datatype_t *datatype,
                                   int root,
                                   struct ompi_communicator_t *comm,
                                   mca_coll_base_module_t *module)
{
    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
    int nreqs;
    ompi_request_t **preq;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    size = ompi_comm_size(comm);
    rank = ompi_comm_rank(comm);
    vrank = (rank + size - root) % size;

    dim = comm->c_cube_dim;
    hibit = opal_hibit(vrank, dim);
    --dim;

    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;
        err = MCA_PML_CALL(recv(buff, count, datatype, peer,
                                MCA_COLL_BASE_TAG_BCAST,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* Send data to the children. */
    err = MPI_SUCCESS;
    preq = reqs;
    nreqs = 0;
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            ++nreqs;
            err = MCA_PML_CALL(isend_init(buff, count, datatype, peer,
                                          MCA_COLL_BASE_TAG_BCAST,
                                          MCA_PML_BASE_SEND_STANDARD,
                                          comm, preq++));
            if (MPI_SUCCESS != err) {
                mca_coll_basic_free_reqs(reqs, nreqs);
                return err;
            }
        }
    }

    /* Start and wait on all requests. */
    if (nreqs > 0) {
        /* Start your engines.  This will never return an error. */
        MCA_PML_CALL(start(nreqs, reqs));

        /* Wait for them all.  If there's an error, note that we don't
         * care what the error was -- just that there *was* an error.
         * The PML will finish all requests, even if one or more of them
         * fail.  i.e., by the end of this call, all the requests are
         * free-able.  So free them anyway -- even if there was an
         * error, and return the error after we free everything. */
        err = ompi_request_wait_all(nreqs, reqs, MPI_STATUSES_IGNORE);

        /* Free the reqs */
        mca_coll_basic_free_reqs(reqs, nreqs);
    }

    /* All done */
    return err;
}
static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
                                       bool process_first,
                                       orte_buffer_t* buffer,
                                       orte_gpr_trigger_cb_fn_t cbfunc)
{
    orte_std_cntr_t i;
    int rc;
    int tag = ORTE_RML_TAG_XCAST;
    int peer, size, rank, hibit, mask;
    orte_buffer_t rbuf, sbuf;
    orte_gpr_notify_message_t *msg;
    orte_process_name_t target;

    /* check to see if there is something to send - this is only true on the HNP end.
     * However, we cannot just test to see if we are the HNP since, if we are a singleton,
     * we are the HNP *and* we still need to handle both ends of the xcast */
    if (NULL != buffer) {
        /* this is the HNP end, so it starts the procedure. Accordingly, it sends
         * its message to the first process in the job, which takes it from there */
        OBJ_CONSTRUCT(&xcastmutex, opal_mutex_t);
        OPAL_THREAD_LOCK(&xcastmutex);

        target.cellid = ORTE_PROC_MY_NAME->cellid;
        target.jobid = job;
        target.vpid = 0;
        if (0 > (rc = mca_oob_send_packed(&target, buffer, tag, 0))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&xcastmutex);
            OBJ_DESTRUCT(&xcastmutex);
            return rc;
        }

        OPAL_THREAD_UNLOCK(&xcastmutex);
        OBJ_DESTRUCT(&xcastmutex);

        return ORTE_SUCCESS;
    }

    /* this process is one of the application procs - accordingly, it will
     * receive the message from its "parent" in the broadcast tree, and
     * then send it along to some set of children */

    /* compute the bitmap, if we haven't already done so */
    if (!bitmap_init) {
        bitmap_save = opal_cube_dim((int)orte_process_info.num_procs);
        bitmap_init = true;
    }
    xcast_bitmap = bitmap_save;
    rank = (int)(ORTE_PROC_MY_NAME->vpid);
    size = (int)orte_process_info.num_procs;
    hibit = opal_hibit(rank, xcast_bitmap);
    --xcast_bitmap;

    /* regardless of who we are, we first have to receive the message */
    OBJ_CONSTRUCT(&rbuf, orte_buffer_t);
    if (0 > (rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&rbuf);
        return rc;
    }

    msg = OBJ_NEW(orte_gpr_notify_message_t);
    if (NULL == msg) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_DESTRUCT(&rbuf);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    i = 1;
    if (ORTE_SUCCESS != (rc = orte_dss.unpack(&rbuf, &msg, &i, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        OBJ_DESTRUCT(&rbuf);
        return rc;
    }
    OBJ_DESTRUCT(&rbuf);

    /* repack the message so we can send it on */
    OBJ_CONSTRUCT(&sbuf, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&sbuf, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&sbuf);
        OBJ_RELEASE(msg);
        return rc;
    }

    /* since the OOB contact info for our peers is in the STG1 message, we have to
     * process it BEFORE we can relay the message to any "children" */
    if (cbfunc != NULL && process_first) {
        /* process the message */
        cbfunc(msg);
    }

    /* send data to any children */
    target.cellid = ORTE_PROC_MY_NAME->cellid;
    target.jobid = ORTE_PROC_MY_NAME->jobid;

    for (i = hibit + 1, mask = 1 << i; i <= xcast_bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            target.vpid = (orte_vpid_t)peer;
            if (0 > (rc = mca_oob_send_packed(&target, &sbuf, tag, 0))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&sbuf);
                OBJ_RELEASE(msg);
                return rc;
            }
        }
    }
    OBJ_DESTRUCT(&sbuf);

    /* if it wasn't the STG1 message, then process it here */
    if (cbfunc != NULL && !process_first) {
        cbfunc(msg);
    }

    OBJ_RELEASE(msg);

    return ORTE_SUCCESS;
}
static int binomial_tree(int rank, int parent, int me, int num_procs,
                         int *nchildren, opal_list_t *childrn,
                         opal_bitmap_t *relatives, bool mine)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                         "%s routed:binomial rank %d parent %d me %d num_procs %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         rank, parent, me, num_procs));

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);
        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer < num_procs) {
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial %d found child %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank,
                                     ORTE_VPID_PRINT(child->vpid)));

                if (mine) {
                    /* this is a direct child - add it to my list */
                    opal_list_append(childrn, &child->super);
                    (*nchildren)++;
                    /* setup the relatives bitmap */
                    opal_bitmap_init(&child->relatives, num_procs);
                    /* point to the relatives */
                    relations = &child->relatives;
                } else {
                    /* we are recording someone else's relatives - set the
                     * bit, and release the child object so it doesn't leak */
                    opal_bitmap_set_bit(relatives, peer);
                    OBJ_RELEASE(child);
                    /* point at the shared relatives bitmap */
                    relations = relatives;
                }
                /* search for this child's relatives */
                binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
            }
        }
        return parent;
    }

    /* find the children of this rank */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                         "%s routed:binomial find children of rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
    bitmap = opal_cube_dim(num_procs);
    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                             "%s routed:binomial find children checking peer %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
        if (peer < num_procs) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial find children computing tree",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* recurse down this subtree */
            if (0 <= (found = binomial_tree(peer, rank, me, num_procs,
                                            nchildren, childrn, relatives, mine))) {
                OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial find children returning found value %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
                return found;
            }
        }
    }
    return -1;
}
/**
 * Adapted from Open MPI; see
 * ~/openmpi-1.6.4/ompi/mca/coll/basic/coll_basic_bcast.c.
 * This does a binomial-tree-based broadcast.
 */
int MPI_Bcast_log(void *buff, int count, MPI_Datatype datatype,
                  int root, MPI_Comm comm)
{
    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
    int rcnt = 0;
    MPI_Request *areqs = NULL;

    MPI_CHECK( err = MPI_Comm_rank(comm, &rank) );
    MPI_CHECK( err = MPI_Comm_size(comm, &size) );

    vrank = (rank + size - root) % size;

    dim = opal_cube_dim(size);
    hibit = opal_hibit(vrank, dim);
    --dim;

    /* Allocate the maximum possible number of send requests: the
     * dimension minus the depth, plus one more for the leaf.  This may
     * allocate more slots than necessary when the number of nodes does
     * not divide evenly into the tree. */
    NULL_CHECK( areqs = malloc(sizeof(MPI_Request) * (dim - (hibit + 1) + 1)) );

    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;
        // printf("Getting ready to receive from %d at %d\n", peer, rank);
        MPI_CHECK( err = MPI_Recv(buff, count, datatype, peer,
                                  BCAST_TAG, comm, MPI_STATUS_IGNORE) );
        if (MPI_SUCCESS != err) {
            free(areqs);
            return err;
        }
    }

    /* Send data to the children.  If there are no children, the
     * return value should still be success. */
    err = MPI_SUCCESS;
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            MPI_CHECK( err = MPI_Isend(buff, count, datatype, peer,
                                       BCAST_TAG, comm, &areqs[rcnt]) );
            rcnt++;
            if (MPI_SUCCESS != err) {
                free(areqs);
                return err;
            }
        }
    }

    /* Wait on all requests. */
    if (rcnt > 0) {
        MPI_CHECK( err = MPI_Waitall(rcnt, areqs, MPI_STATUSES_IGNORE) );
    }

    /* Free the reqs (even when no sends were posted) */
    free(areqs);

    /* All done */
    return err;
}
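Because MPI_Bcast_log mirrors the MPI_Bcast signature, it drops in wherever the library call would be used. A minimal hedged driver follows, assuming MPI_Bcast_log and its helpers (MPI_CHECK, NULL_CHECK, BCAST_TAG, opal_cube_dim, opal_hibit) are compiled and linked alongside this file:

#include <mpi.h>
#include <stdio.h>

/* Prototype for the adapted broadcast above (assumed linked in). */
int MPI_Bcast_log(void *buff, int count, MPI_Datatype datatype,
                  int root, MPI_Comm comm);

int main(int argc, char **argv)
{
    int rank;
    int data[4] = { 0, 0, 0, 0 };

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    if (0 == rank) {
        /* only the root fills the payload */
        for (int i = 0; i < 4; ++i) {
            data[i] = i + 1;
        }
    }

    /* after the call, every rank holds {1, 2, 3, 4} */
    MPI_Bcast_log(data, 4, MPI_INT, 0, MPI_COMM_WORLD);

    printf("rank %d got %d %d %d %d\n",
           rank, data[0], data[1], data[2], data[3]);

    MPI_Finalize();
    return 0;
}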
/*
 * barrier_intra_log
 *
 * Function:  - barrier using O(log(N)) algorithm
 * Accepts:   - same as MPI_Barrier()
 * Returns:   - MPI_SUCCESS or error code
 */
int mca_coll_basic_barrier_intra_log(struct ompi_communicator_t *comm,
                                     mca_coll_base_module_t *module)
{
    int i;
    int err;
    int peer;
    int dim;
    int hibit;
    int mask;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);

    /* Send null-messages up and down the tree.  Synchronization at the
     * root (rank 0). */
    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* Receive from children. */
    for (i = dim, mask = 1 << i; i > hibit; --i, mask >>= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    comm, MPI_STATUS_IGNORE));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* Send to and receive from parent. */
    if (rank > 0) {
        peer = rank & ~(1 << hibit);
        err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer,
                                MCA_COLL_BASE_TAG_BARRIER,
                                MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(recv(NULL, 0, MPI_BYTE, peer,
                                MCA_COLL_BASE_TAG_BARRIER,
                                comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* Send to children. */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            err = MCA_PML_CALL(send(NULL, 0, MPI_BYTE, peer,
                                    MCA_COLL_BASE_TAG_BARRIER,
                                    MCA_PML_BASE_SEND_STANDARD, comm));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
    }

    /* All done */
    return MPI_SUCCESS;
}
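For readers outside the Open MPI source tree, the same gather-up/release-down pattern can be written against the public MPI API. The sketch below is a hedged re-expression, not the library's code: barrier_log and BARRIER_TAG are hypothetical names, and cube_dim/hibit are the same stand-ins assumed in the earlier example.

#include <mpi.h>

#define BARRIER_TAG 4711   /* arbitrary tag for this sketch */

/* Stand-ins assumed to mirror opal_cube_dim()/opal_hibit(). */
static int cube_dim(int n)
{
    int dim = 0;
    for (int size = 1; size < n; size <<= 1) {
        dim++;
    }
    return dim;
}

static int hibit(int value, int start)
{
    unsigned int mask = 1u << start;
    for (; start >= 0; --start, mask >>= 1) {
        if (value & mask) {
            break;
        }
    }
    return start;
}

int barrier_log(MPI_Comm comm)
{
    int size, rank;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    int dim = cube_dim(size);
    int hi = hibit(rank, dim);
    --dim;

    /* Gather phase: collect a zero-byte token from each child. */
    for (int i = dim, mask = 1 << i; i > hi; --i, mask >>= 1) {
        int peer = rank | mask;
        if (peer < size) {
            MPI_Recv(NULL, 0, MPI_BYTE, peer, BARRIER_TAG,
                     comm, MPI_STATUS_IGNORE);
        }
    }

    /* Report to the parent, then wait for the release. */
    if (rank > 0) {
        int parent = rank & ~(1 << hi);
        MPI_Send(NULL, 0, MPI_BYTE, parent, BARRIER_TAG, comm);
        MPI_Recv(NULL, 0, MPI_BYTE, parent, BARRIER_TAG,
                 comm, MPI_STATUS_IGNORE);
    }

    /* Release phase: notify each child in turn. */
    for (int i = hi + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        int peer = rank | mask;
        if (peer < size) {
            MPI_Send(NULL, 0, MPI_BYTE, peer, BARRIER_TAG, comm);
        }
    }
    return MPI_SUCCESS;
}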
int ompi_coll_portals4_barrier_intra(struct ompi_communicator_t *comm,
                                     mca_coll_base_module_t *module)
{
    mca_coll_portals4_module_t *portals4_module = (mca_coll_portals4_module_t*) module;
    int ret, i, dim, hibit, mask, num_msgs;
    int size = ompi_comm_size(comm);
    int rank = ompi_comm_rank(comm);
    ptl_ct_event_t ct;
    ptl_handle_ct_t ct_h;
    ptl_handle_me_t me_h;
    ptl_me_t me;
    size_t count;
    ptl_match_bits_t match_bits;
    ptl_handle_md_t md_h;
    void *base;

    ompi_coll_portals4_get_md(0, &md_h, &base);

    count = opal_atomic_add_size_t(&portals4_module->barrier_count, 1);

    ret = PtlCTAlloc(mca_coll_portals4_component.ni_h, &ct_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTAlloc failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    }

    COLL_PORTALS4_SET_BITS(match_bits, ompi_comm_get_cid(comm),
                           0, COLL_PORTALS4_BARRIER, count);

    /* Build "tree" out of hypercube */
    dim = comm->c_cube_dim;
    hibit = opal_hibit(rank, dim);
    --dim;

    /* receive space */
    me.start = NULL;
    me.length = 0;
    me.ct_handle = ct_h;
    me.min_free = 0;
    me.uid = mca_coll_portals4_component.uid;
    me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE |
        PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE |
        PTL_ME_EVENT_CT_COMM | PTL_ME_EVENT_CT_OVERFLOW;
    me.match_id.phys.nid = PTL_NID_ANY;
    me.match_id.phys.pid = PTL_PID_ANY;
    me.match_bits = match_bits;
    me.ignore_bits = 0;
    ret = PtlMEAppend(mca_coll_portals4_component.ni_h,
                      mca_coll_portals4_component.pt_idx,
                      &me, PTL_PRIORITY_LIST, NULL, &me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEAppend failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* calculate number of children to receive from */
    num_msgs = ompi_coll_portals4_get_nchildren(dim + 1, hibit, rank, size);

    /* send to parent when children have sent to us */
    if (rank > 0) {
        int parent = rank & ~(1 << hibit);

        ret = PtlTriggeredPut(md_h, 0, 0, PTL_NO_ACK_REQ,
                              ompi_coll_portals4_get_peer(comm, parent),
                              mca_coll_portals4_component.pt_idx,
                              match_bits, 0, NULL, 0, ct_h, num_msgs);
        if (PTL_OK != ret) {
            opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                "%s:%d: PtlTriggeredPut failed: %d\n",
                                __FILE__, __LINE__, ret);
            return OMPI_ERROR;
        }

        /* we'll need to wait for the parent's response before the next
         * set of comms */
        num_msgs++;
    }

    /* send to children when parent (or all children if root) has sent to us */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        int peer = rank | mask;

        if (peer < size) {
            ret = PtlTriggeredPut(md_h, 0, 0, PTL_NO_ACK_REQ,
                                  ompi_coll_portals4_get_peer(comm, peer),
                                  mca_coll_portals4_component.pt_idx,
                                  match_bits, 0, NULL, 0, ct_h, num_msgs);
            if (PTL_OK != ret) {
                opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                                    "%s:%d: PtlTriggeredPut failed: %d\n",
                                    __FILE__, __LINE__, ret);
                return OMPI_ERROR;
            }
        }
    }

    /* Wait for all incoming messages */
    ret = PtlCTWait(ct_h, num_msgs, &ct);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTWait failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    /* cleanup */
    ret = PtlMEUnlink(me_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlMEUnlink failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    ret = PtlCTFree(ct_h);
    if (PTL_OK != ret) {
        opal_output_verbose(1, ompi_coll_base_framework.framework_output,
                            "%s:%d: PtlCTFree failed: %d\n",
                            __FILE__, __LINE__, ret);
        return OMPI_ERROR;
    }

    return OMPI_SUCCESS;
}
/*
 * The Binomial Spanning Tree algorithm.
 *
 * Outlay: scales with log2(NP) and uses a single pSync element
 * for synchronization.
 */
static int __algorithm_binomial_tree(struct oshmem_group_t *group,
                                     int PE_root, void *target,
                                     const void *source, size_t nlong,
                                     long *pSync)
{
    int rc = OSHMEM_SUCCESS;
    long value = SHMEM_SYNC_INIT;
    int root_id = oshmem_proc_group_find_id(group, PE_root);
    int my_id = oshmem_proc_group_find_id(group, group->my_pe);
    int peer_id = 0;
    int peer_pe = 0;
    int vrank;
    int dim = opal_cube_dim(group->proc_count);
    int hibit;
    int mask;
    int i = 0;

    SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe);
    SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld root = #%d",
                  group->my_pe, pSync[0], PE_root);

    vrank = (my_id + group->proc_count - root_id) % group->proc_count;
    hibit = opal_hibit(vrank, dim);
    SCOLL_VERBOSE(15, "[#%d] dim = %d vrank = %d hibit = %d",
                  group->my_pe, dim, vrank, hibit);
    dim--;

    pSync[0] = SHMEM_SYNC_READY;

    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        value = SHMEM_SYNC_READY;
        SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe);
        rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        while ((value = pSync[0]) < 0) {
            SCOLL_VERBOSE(14, "[#%d] Broadcast size is a negative value (%li)\n",
                          group->my_pe, pSync[0]);
            MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG));
        }
        if (OSHMEM_SUCCESS != rc) {
            return rc;
        }
        nlong = (size_t) pSync[0];
    }

    /* Send data to the children. */
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer_id = vrank | mask;
        if (peer_id < group->proc_count) {
            /* Wait for the child to be ready to receive
             * (pSync must have the initial value) */
            peer_id = (peer_id + root_id) % group->proc_count;
            peer_pe = oshmem_proc_pe(group->proc_array[peer_id]);

            SCOLL_VERBOSE(14, "[#%d] check remote pe is ready to receive #%d",
                          group->my_pe, peer_pe);
            do {
                rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long),
                                       (void*)pSync, peer_pe));
            } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY));

            SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe);
            rc = MCA_SPML_CALL(put(target, nlong,
                                   (my_id == root_id ? (void *)source : target),
                                   peer_pe));
            if (OSHMEM_SUCCESS != rc) {
                /* don't signal the child if the data put failed */
                break;
            }
            MCA_SPML_CALL(fence());

            SCOLL_VERBOSE(14, "[#%d] signal to #%d", group->my_pe, peer_pe);
            value = nlong;
            rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe));
            if (OSHMEM_SUCCESS != rc) {
                break;
            }
        }
    }

    return rc;
}