/*
 * Recursively walk the binomial tree starting at "rank" looking for "me".
 *
 * When rank == me, every child of that rank is visited:
 *   - if "children" is non-NULL the child is a direct child of the caller:
 *     a tree node is created, appended to "children", counted in
 *     *num_children, and its relatives bitmap is initialized;
 *   - otherwise only the bit for that child is set in "relatives".
 * In both cases the search is restarted from rank 0 for that child so
 * that all of its descendants are recorded in the same bitmap.
 *
 * @param rank          rank currently being examined
 * @param parent        parent of "rank" in the tree
 * @param me            rank being searched for
 * @param num_procs     total number of ranks in the tree
 * @param num_children  incremented once per direct child (only used when
 *                      "children" is non-NULL)
 * @param children      list receiving direct children, or NULL when only
 *                      recording relatives
 * @param relatives     bitmap receiving descendant ranks when "children"
 *                      is NULL
 *
 * @return "parent" once "me" has been located, -1 if "me" is not found
 *         beneath "rank".
 */
int down_search(int rank, int parent, int me, int num_procs,
                int *num_children, opal_list_t *children,
                opal_bitmap_t *relatives)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);
        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer >= num_procs) {
                continue;
            }

            if (NULL != children) {
                /* this is a direct child - create its node and add it to
                 * my list. The node is only allocated here: allocating it
                 * unconditionally leaked one object for every relative
                 * recorded in the branch below. */
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                opal_list_append(children, &child->super);
                (*num_children)++;
                /* setup the relatives bitmap */
                opal_bitmap_init(&child->relatives, num_procs);
                /* point to the relatives */
                relations = &child->relatives;
            } else {
                /* we are recording someone's relatives - set the bit */
                opal_bitmap_set_bit(relatives, peer);
                /* point to this relations */
                relations = relatives;
            }

            /* search for this child's relatives */
            down_search(0, 0, peer, num_procs, NULL, NULL, relations);
        }
        return parent;
    }

    /* not me - descend into each child of this rank */
    bitmap = opal_cube_dim(num_procs);
    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < num_procs) {
            /* execute compute on this child */
            found = down_search(peer, rank, me, num_procs,
                                num_children, children, relatives);
            if (0 <= found) {
                return found;
            }
        }
    }
    return -1;
}
ompi_communicator_t *ompi_comm_allocate ( int local_size, int remote_size ) { ompi_communicator_t *new_comm; /* create new communicator element */ new_comm = OBJ_NEW(ompi_communicator_t); new_comm->c_local_group = ompi_group_allocate ( local_size ); if ( 0 < remote_size ) { new_comm->c_remote_group = ompi_group_allocate (remote_size); new_comm->c_flags |= OMPI_COMM_INTER; } else { /* * simplifies some operations (e.g. p2p), if * we can always use the remote group */ new_comm->c_remote_group = new_comm->c_local_group; OBJ_RETAIN(new_comm->c_remote_group); } /* fill in the inscribing hyper-cube dimensions */ new_comm->c_cube_dim = opal_cube_dim(local_size); return new_comm; }
/*
 * Binomial-tree xcast.
 *
 * Sender side (buffer != NULL): send "buffer" to vpid 0 of "job" and
 * return. Receiver side (buffer == NULL): receive the message, unpack it
 * into a notify message, repack it, relay it to this process' children in
 * the binomial tree, and hand the message to "cbfunc" either before
 * (process_first == true) or after the relay.
 *
 * @param job            job whose vpid 0 receives the initial send
 * @param process_first  run cbfunc before relaying to children
 * @param buffer         message to send, or NULL on the receiving side
 * @param cbfunc         optional callback invoked with the received message
 *
 * @return ORTE_SUCCESS, ORTE_ERR_OUT_OF_RESOURCE, or the (negative / non
 *         success) code returned by the failing OOB or DSS call.
 *
 * Every error path releases the buffers and message constructed so far.
 */
static int mca_oob_xcast_binomial_tree(orte_jobid_t job,
                                       bool process_first,
                                       orte_buffer_t* buffer,
                                       orte_gpr_trigger_cb_fn_t cbfunc)
{
    orte_std_cntr_t i;
    int rc;
    int tag = ORTE_RML_TAG_XCAST;
    int peer, size, rank, hibit, mask;
    orte_buffer_t rbuf, sbuf;
    orte_gpr_notify_message_t *msg;
    orte_process_name_t target;

    /* check to see if there is something to send - this is only true on the HNP end.
     * However, we cannot just test to see if we are the HNP since, if we are a singleton,
     * we are the HNP *and* we still need to handle both ends of the xcast
     */
    if (NULL != buffer) {
        /* this is the HNP end, so it starts the procedure. Accordingly, it sends its
         * message to the first process in the job in the peer list, which takes it from there
         */
        OBJ_CONSTRUCT(&xcastmutex, opal_mutex_t);
        OPAL_THREAD_LOCK(&xcastmutex);

        target.cellid = ORTE_PROC_MY_NAME->cellid;
        target.jobid = job;
        target.vpid = 0;
        if (0 > (rc = mca_oob_send_packed(&target, buffer, tag, 0))) {
            ORTE_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&xcastmutex);
            OBJ_DESTRUCT(&xcastmutex);
            return rc;
        }

        OPAL_THREAD_UNLOCK(&xcastmutex);
        OBJ_DESTRUCT(&xcastmutex);
        return ORTE_SUCCESS;
    }

    /* this process is one of the application procs - accordingly, it will
     * receive the message from its "parent" in the broadcast tree, and
     * then send it along to some set of children
     */

    /* compute the bitmap, if we haven't already done so */
    if (!bitmap_init) {
        bitmap_save = opal_cube_dim((int)orte_process_info.num_procs);
        bitmap_init = true;
    }

    xcast_bitmap = bitmap_save;
    rank = (int)(ORTE_PROC_MY_NAME->vpid);
    size = (int)orte_process_info.num_procs;

    hibit = opal_hibit(rank, xcast_bitmap);
    --xcast_bitmap;

    /* regardless of who we are, we first have to receive the message */
    OBJ_CONSTRUCT(&rbuf, orte_buffer_t);
    if (0 > (rc = mca_oob_recv_packed(ORTE_NAME_WILDCARD, &rbuf, tag))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&rbuf);
        return rc;
    }

    msg = OBJ_NEW(orte_gpr_notify_message_t);
    if (NULL == msg) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        OBJ_DESTRUCT(&rbuf);    /* previously leaked on this path */
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    i = 1;
    rc = orte_dss.unpack(&rbuf, &msg, &i, ORTE_GPR_NOTIFY_MSG);
    /* the receive buffer is no longer needed whether or not unpack worked */
    OBJ_DESTRUCT(&rbuf);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return rc;
    }

    /* repack the message so we can send it on */
    OBJ_CONSTRUCT(&sbuf, orte_buffer_t);
    if (ORTE_SUCCESS != (rc = orte_dss.pack(&sbuf, &msg, 1, ORTE_GPR_NOTIFY_MSG))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&sbuf);
        OBJ_RELEASE(msg);       /* previously leaked on this path */
        return rc;
    }

    /* since the OOB contact info for our peers is in the STG1 message, we have to
     * process it BEFORE we can relay the message to any "children"
     */
    if (cbfunc != NULL && process_first) {
        /* process the message */
        cbfunc(msg);
    }

    /* send data to any children */
    target.cellid = ORTE_PROC_MY_NAME->cellid;
    target.jobid = ORTE_PROC_MY_NAME->jobid;

    for (i = hibit + 1, mask = 1 << i; i <= xcast_bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        if (peer < size) {
            target.vpid = (orte_vpid_t)peer;
            if (0 > (rc = mca_oob_send_packed(&target, &sbuf, tag, 0))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&sbuf);    /* previously leaked on this path */
                OBJ_RELEASE(msg);
                return rc;
            }
        }
    }
    OBJ_DESTRUCT(&sbuf);

    /* if it wasn't the STG1 message, then process it here */
    if (cbfunc != NULL && !process_first) {
        cbfunc(msg);
    }
    OBJ_RELEASE(msg);

    return ORTE_SUCCESS;
}
/*
 * Recursively walk the binomial tree starting at "rank" looking for "me".
 *
 * When rank == me, every child of that rank is visited:
 *   - if "mine" is true the child is a direct child of the caller: a tree
 *     node is created, appended to "childrn", counted in *nchildren, and
 *     its relatives bitmap is initialized;
 *   - otherwise only the bit for that child is set in "relatives".
 * In both cases the search is restarted from rank 0 (with mine == false)
 * so that the child's descendants are recorded in the same bitmap.
 *
 * @param rank       rank currently being examined
 * @param parent     parent of "rank" in the tree
 * @param me         rank being searched for
 * @param num_procs  total number of ranks in the tree
 * @param nchildren  incremented once per direct child (mine == true)
 * @param childrn    list receiving direct children (mine == true)
 * @param relatives  bitmap receiving descendant ranks (mine == false)
 * @param mine       true when collecting the caller's direct children
 *
 * @return "parent" once "me" has been located, -1 if "me" is not found
 *         beneath "rank".
 */
static int binomial_tree(int rank, int parent, int me, int num_procs,
                         int *nchildren, opal_list_t *childrn,
                         opal_bitmap_t *relatives, bool mine)
{
    int i, bitmap, peer, hibit, mask, found;
    orte_routed_tree_t *child;
    opal_bitmap_t *relations;

    OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                         "%s routed:binomial rank %d parent %d me %d num_procs %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         rank, parent, me, num_procs));

    /* is this me? */
    if (me == rank) {
        bitmap = opal_cube_dim(num_procs);
        hibit = opal_hibit(rank, bitmap);
        --bitmap;

        for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
            peer = rank | mask;
            if (peer >= num_procs) {
                continue;
            }

            /* peer is the value that gets stored in child->vpid below */
            OPAL_OUTPUT_VERBOSE((3, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial %d found child %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 rank,
                                 ORTE_VPID_PRINT(peer)));

            if (mine) {
                /* this is a direct child - create its node and add it to
                 * my list. The node is only allocated here: allocating it
                 * unconditionally leaked one object for every relative
                 * recorded in the branch below. */
                child = OBJ_NEW(orte_routed_tree_t);
                child->vpid = peer;
                opal_list_append(childrn, &child->super);
                (*nchildren)++;
                /* setup the relatives bitmap */
                opal_bitmap_init(&child->relatives, num_procs);
                /* point to the relatives */
                relations = &child->relatives;
            } else {
                /* we are recording someone's relatives - set the bit */
                opal_bitmap_set_bit(relatives, peer);
                /* point to this relations */
                relations = relatives;
            }

            /* search for this child's relatives */
            binomial_tree(0, 0, peer, num_procs, nchildren, childrn, relations, false);
        }
        return parent;
    }

    /* find the children of this rank */
    OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                         "%s routed:binomial find children of rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rank));
    bitmap = opal_cube_dim(num_procs);
    hibit = opal_hibit(rank, bitmap);
    --bitmap;

    for (i = hibit + 1, mask = 1 << i; i <= bitmap; ++i, mask <<= 1) {
        peer = rank | mask;
        OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                             "%s routed:binomial find children checking peer %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), peer));
        if (peer < num_procs) {
            OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                 "%s routed:binomial find children computing tree",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            /* execute compute on this child */
            found = binomial_tree(peer, rank, me, num_procs,
                                  nchildren, childrn, relatives, mine);
            if (0 <= found) {
                OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output,
                                     "%s routed:binomial find children returning found value %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), found));
                return found;
            }
        }
    }
    return -1;
}
/**
 * Binomial-tree broadcast, adapted from Open MPI
 * (openmpi-1.6.4/ompi/mca/coll/basic/coll_basic_bcast.c).
 *
 * Ranks are rotated so that "root" becomes virtual rank 0. Every non-root
 * rank first receives the data from its parent in the tree, then every
 * rank forwards it to its children with non-blocking sends and waits for
 * those sends to complete.
 *
 * @param buff      buffer holding (root) or receiving (others) the data
 * @param count     number of elements in buff
 * @param datatype  MPI datatype of the elements
 * @param root      rank that owns the data
 * @param comm      communicator to broadcast over
 *
 * @return MPI_SUCCESS, or the error code of the failing MPI call.
 *
 * The request array is released on every return path in this function.
 */
int MPI_Bcast_log( void *buff, int count, MPI_Datatype datatype, int root, MPI_Comm comm )
{
    int i;
    int size;
    int rank;
    int vrank;
    int peer;
    int dim;
    int hibit;
    int mask;
    int err;
    int rcnt = 0;
    int max_children;
    MPI_Request *areqs = NULL;

    MPI_CHECK( err = MPI_Comm_rank( comm, &rank ) );
    MPI_CHECK( err = MPI_Comm_size( comm, &size ) );

    vrank = (rank + size - root) % size;

    dim = opal_cube_dim(size);
    hibit = opal_hibit(vrank, dim);
    --dim;

    /* The send loop below runs for i in [hibit + 1, dim], i.e. at most
       dim - hibit children. This may be more slots than needed when the
       number of ranks does not fill the tree. Always ask for at least one
       slot: malloc(0) is allowed to return NULL, which would make a leaf
       rank look like an allocation failure. */
    max_children = dim - hibit;
    NULL_CHECK( areqs = malloc( sizeof( MPI_Request ) * ( max_children > 0 ? max_children : 1 ) ) );

    /* Receive data from parent in the tree. */
    if (vrank > 0) {
        peer = ((vrank & ~(1 << hibit)) + root) % size;
        MPI_CHECK( err = MPI_Recv( buff, count, datatype, peer, BCAST_TAG, comm, MPI_STATUS_IGNORE ) );
        if (MPI_SUCCESS != err) {
            free(areqs);
            return err;
        }
    }

    /* Send data to the children. */
    /* just in case there are no children, return should be success */
    err = MPI_SUCCESS;
    for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) {
        peer = vrank | mask;
        if (peer < size) {
            peer = (peer + root) % size;
            MPI_CHECK( err = MPI_Isend( buff, count, datatype, peer, BCAST_TAG, comm, &areqs[ rcnt ] ) );
            rcnt++;
            if (MPI_SUCCESS != err) {
                free(areqs);
                return err;
            }
        }
    }

    /* Wait on all requests. */
    if (rcnt > 0) {
        MPI_CHECK( err = MPI_Waitall( rcnt, areqs, MPI_STATUSES_IGNORE ) );
    }

    /* Free the reqs - also for leaf ranks that posted no sends */
    free(areqs);

    /* All done */
    return err;
}
/* * Initialize comm world/self/null/parent. */ int ompi_comm_init(void) { ompi_group_t *group; size_t size; /* Setup communicator array */ OBJ_CONSTRUCT(&ompi_mpi_communicators, opal_pointer_array_t); if( OPAL_SUCCESS != opal_pointer_array_init(&ompi_mpi_communicators, 0, OMPI_FORTRAN_HANDLE_MAX, 64) ) { return OMPI_ERROR; } /* Setup MPI_COMM_WORLD */ OBJ_CONSTRUCT(&ompi_mpi_comm_world, ompi_communicator_t); group = OBJ_NEW(ompi_group_t); group->grp_proc_pointers = ompi_proc_world(&size); group->grp_proc_count = (int)size; OMPI_GROUP_SET_INTRINSIC (group); OMPI_GROUP_SET_DENSE (group); ompi_set_group_rank(group, ompi_proc_local()); ompi_group_increment_proc_count (group); ompi_mpi_comm_world.comm.c_contextid = 0; ompi_mpi_comm_world.comm.c_id_start_index = 4; ompi_mpi_comm_world.comm.c_id_available = 4; ompi_mpi_comm_world.comm.c_f_to_c_index = 0; ompi_mpi_comm_world.comm.c_my_rank = group->grp_my_rank; ompi_mpi_comm_world.comm.c_local_group = group; ompi_mpi_comm_world.comm.c_remote_group = group; OBJ_RETAIN(ompi_mpi_comm_world.comm.c_remote_group); ompi_mpi_comm_world.comm.c_cube_dim = opal_cube_dim((int)size); ompi_mpi_comm_world.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_world.comm); opal_pointer_array_set_item (&ompi_mpi_communicators, 0, &ompi_mpi_comm_world); MEMCHECKER (memset (ompi_mpi_comm_world.comm.c_name, 0, MPI_MAX_OBJECT_NAME)); strncpy (ompi_mpi_comm_world.comm.c_name, "MPI_COMM_WORLD", strlen("MPI_COMM_WORLD")+1 ); ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_NAMEISSET; ompi_mpi_comm_world.comm.c_flags |= OMPI_COMM_INTRINSIC; /* We have to create a hash (although it is legal to leave this filed NULL -- the attribute accessor functions will intepret this as "there are no attributes cached on this object") because MPI_COMM_WORLD has some predefined attributes. 
*/ ompi_attr_hash_init(&ompi_mpi_comm_world.comm.c_keyhash); /* Setup MPI_COMM_SELF */ OBJ_CONSTRUCT(&ompi_mpi_comm_self, ompi_communicator_t); group = OBJ_NEW(ompi_group_t); group->grp_proc_pointers = ompi_proc_self(&size); group->grp_my_rank = 0; group->grp_proc_count = (int)size; OMPI_GROUP_SET_INTRINSIC (group); OMPI_GROUP_SET_DENSE (group); ompi_mpi_comm_self.comm.c_contextid = 1; ompi_mpi_comm_self.comm.c_f_to_c_index = 1; ompi_mpi_comm_self.comm.c_id_start_index = 20; ompi_mpi_comm_self.comm.c_id_available = 20; ompi_mpi_comm_self.comm.c_my_rank = group->grp_my_rank; ompi_mpi_comm_self.comm.c_local_group = group; ompi_mpi_comm_self.comm.c_remote_group = group; OBJ_RETAIN(ompi_mpi_comm_self.comm.c_remote_group); ompi_mpi_comm_self.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); OMPI_COMM_SET_PML_ADDED(&ompi_mpi_comm_self.comm); opal_pointer_array_set_item (&ompi_mpi_communicators, 1, &ompi_mpi_comm_self); MEMCHECKER (memset (ompi_mpi_comm_self.comm.c_name, 0, MPI_MAX_OBJECT_NAME)); strncpy(ompi_mpi_comm_self.comm.c_name,"MPI_COMM_SELF",strlen("MPI_COMM_SELF")+1); ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_NAMEISSET; ompi_mpi_comm_self.comm.c_flags |= OMPI_COMM_INTRINSIC; /* We can set MPI_COMM_SELF's keyhash to NULL because it has no predefined attributes. If a user defines an attribute on MPI_COMM_SELF, the keyhash will automatically be created. 
*/ ompi_mpi_comm_self.comm.c_keyhash = NULL; /* Setup MPI_COMM_NULL */ OBJ_CONSTRUCT(&ompi_mpi_comm_null, ompi_communicator_t); ompi_mpi_comm_null.comm.c_local_group = &ompi_mpi_group_null.group; ompi_mpi_comm_null.comm.c_remote_group = &ompi_mpi_group_null.group; OBJ_RETAIN(&ompi_mpi_group_null.group); OBJ_RETAIN(&ompi_mpi_group_null.group); ompi_mpi_comm_null.comm.c_contextid = 2; ompi_mpi_comm_null.comm.c_f_to_c_index = 2; ompi_mpi_comm_null.comm.c_my_rank = MPI_PROC_NULL; ompi_mpi_comm_null.comm.error_handler = &ompi_mpi_errors_are_fatal.eh; OBJ_RETAIN( &ompi_mpi_errors_are_fatal.eh ); opal_pointer_array_set_item (&ompi_mpi_communicators, 2, &ompi_mpi_comm_null); MEMCHECKER (memset (ompi_mpi_comm_null.comm.c_name, 0, MPI_MAX_OBJECT_NAME)); strncpy(ompi_mpi_comm_null.comm.c_name,"MPI_COMM_NULL",strlen("MPI_COMM_NULL")+1); ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_NAMEISSET; ompi_mpi_comm_null.comm.c_flags |= OMPI_COMM_INTRINSIC; /* Initialize the parent communicator to MPI_COMM_NULL */ ompi_mpi_comm_parent = &ompi_mpi_comm_null.comm; OBJ_RETAIN(&ompi_mpi_comm_null); OBJ_RETAIN(&ompi_mpi_group_null.group); OBJ_RETAIN(&ompi_mpi_errors_are_fatal.eh); /* initialize the comm_reg stuff for multi-threaded comm_cid allocation */ ompi_comm_reg_init(); return OMPI_SUCCESS; }
/* The Binomial Spanning Tree algorithm. Outlay: The game scales with log2(NP) and uses 1 byte of memory. */ static int __algorithm_binomial_tree(struct oshmem_group_t *group, int PE_root, void *target, const void *source, size_t nlong, long *pSync) { int rc = OSHMEM_SUCCESS; long value = SHMEM_SYNC_INIT; int root_id = oshmem_proc_group_find_id(group, PE_root); int my_id = oshmem_proc_group_find_id(group, group->my_pe); int peer_id = 0; int peer_pe = 0; int vrank; int dim = opal_cube_dim(group->proc_count); int hibit; int mask; int i = 0; SCOLL_VERBOSE(12, "[#%d] Broadcast algorithm: Tree", group->my_pe); SCOLL_VERBOSE(15, "[#%d] pSync[0] = %ld root = #%d", group->my_pe, pSync[0], PE_root); vrank = (my_id + group->proc_count - root_id) % group->proc_count; hibit = opal_hibit(vrank, dim); SCOLL_VERBOSE(15, "[#%d] dim = %d vrank = %d hibit = %d", group->my_pe, dim, vrank, hibit); dim--; pSync[0] = SHMEM_SYNC_READY; /* Receive data from parent in the tree. */ if (vrank > 0) { value = SHMEM_SYNC_READY; SCOLL_VERBOSE(14, "[#%d] wait", group->my_pe); rc = MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); while ((value = pSync[0]) < 0) { SCOLL_VERBOSE(14, "[#%d] Broadcast size is a negative value (%li)\n", group->my_pe, pSync[0]); MCA_SPML_CALL(wait((void*)pSync, SHMEM_CMP_NE, (void*)&value, SHMEM_LONG)); } if (OSHMEM_SUCCESS != rc) { return rc; } nlong = (size_t) pSync[0]; } /* Send data to the children. 
*/ for (i = hibit + 1, mask = 1 << i; i <= dim; ++i, mask <<= 1) { peer_id = vrank | mask; if (peer_id < group->proc_count) { /* Wait for the child to be ready to receive (pSync must have the initial value) */ peer_id = (peer_id + root_id) % group->proc_count; peer_pe = oshmem_proc_pe(group->proc_array[peer_id]); SCOLL_VERBOSE(14, "[#%d] check remote pe is ready to receive #%d", group->my_pe, peer_pe); do { rc = MCA_SPML_CALL(get((void*)pSync, sizeof(long), (void*)pSync, peer_pe)); } while ((OSHMEM_SUCCESS == rc) && (pSync[0] != SHMEM_SYNC_READY)); SCOLL_VERBOSE(14, "[#%d] send data to #%d", group->my_pe, peer_pe); rc = MCA_SPML_CALL(put(target, nlong, (my_id == root_id ? (void *)source : target), peer_pe)); MCA_SPML_CALL(fence()); SCOLL_VERBOSE(14, "[#%d] signals to #%d", group->my_pe, peer_pe); value = nlong; rc = MCA_SPML_CALL(put((void*)pSync, sizeof(value), (void*)&value, peer_pe)); if (OSHMEM_SUCCESS != rc) { break; } } } return rc; }