int mca_btl_self_add_procs( struct mca_btl_base_module_t* btl,
                            size_t nprocs,
                            struct opal_proc_t **procs,
                            struct mca_btl_base_endpoint_t **peers,
                            opal_bitmap_t* reachability )
{
    int i;

    for( i = 0; i < (int)nprocs; i++ ) {
        if( 0 == opal_compare_proc(procs[i]->proc_name, OPAL_PROC_MY_NAME) ) {
            opal_bitmap_set_bit( reachability, i );
            break;  /* there will always be only one ... */
        }
    }
    return OPAL_SUCCESS;
}
/**
 * PML->BTL notification of change in the process list.
 * The self BTL can only reach the local process, so it scans the
 * incoming list for its own name, marks that single entry as
 * reachable, and hands back a dummy endpoint.
 *
 * @param btl (IN)            BTL module
 * @param nprocs (IN)         number of processes in the list
 * @param procs (IN)          array of process descriptors
 * @param peers (OUT)         array of per-process endpoints
 * @param reachability (OUT)  bitmap of reachable processes
 * @return OPAL_SUCCESS or error status on failure.
 */
static int mca_btl_self_add_procs (struct mca_btl_base_module_t *btl,
                                   size_t nprocs,
                                   struct opal_proc_t **procs,
                                   struct mca_btl_base_endpoint_t **peers,
                                   opal_bitmap_t* reachability)
{
    for (int i = 0; i < (int)nprocs; i++ ) {
        if( 0 == opal_compare_proc(procs[i]->proc_name, OPAL_PROC_MY_NAME) ) {
            opal_bitmap_set_bit( reachability, i );
            /* need to return something to keep the bml from ignoring us */
            peers[i] = (struct mca_btl_base_endpoint_t *) 1;
            break;  /* there will always be only one ... */
        }
    }

    return OPAL_SUCCESS;
}
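/*
 * For reference, a minimal sketch of the comparison the snippets above
 * rely on. This is NOT the actual OPAL implementation, only an
 * illustration of the semantics assumed here: opal_process_name_t
 * carries a jobid and a vpid, and two names compare equal (return 0)
 * exactly when both fields match. The function name is hypothetical.
 */
static int example_compare_proc_sketch(const opal_process_name_t p1,
                                       const opal_process_name_t p2)
{
    if (p1.jobid != p2.jobid) {
        return (p1.jobid < p2.jobid) ? -1 : 1;
    }
    if (p1.vpid != p2.vpid) {
        return (p1.vpid < p2.vpid) ? -1 : 1;
    }
    return 0;  /* same job, same rank: the same process */
}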
/**
 * This PML monitors only the processes in MPI_COMM_WORLD. As OMPI now lazily
 * adds peers on the first call to add_procs, we need to check how many
 * processes are in MPI_COMM_WORLD to create the storage with the right size.
 */
int mca_pml_monitoring_add_procs(struct ompi_proc_t **procs,
                                 size_t nprocs)
{
    opal_process_name_t tmp, wp_name;
    size_t i, peer_rank, nprocs_world;
    uint64_t key;

    if(NULL == translation_ht) {
        translation_ht = OBJ_NEW(opal_hash_table_t);
        opal_hash_table_init(translation_ht, 2048);
        /* get my rank in MPI_COMM_WORLD */
        my_rank = ompi_comm_rank((ompi_communicator_t*)&ompi_mpi_comm_world);
    }

    nprocs_world = ompi_comm_size((ompi_communicator_t*)&ompi_mpi_comm_world);
    /* All procs in the same MPI_COMM_WORLD need to be added to the hash table */
    for( i = 0; i < nprocs; i++ ) {
        /* Extract the peer procname from the procs array */
        if( ompi_proc_is_sentinel(procs[i]) ) {
            tmp = ompi_proc_sentinel_to_name((uintptr_t)procs[i]);
        } else {
            tmp = procs[i]->super.proc_name;
        }
        if( tmp.jobid != ompi_proc_local_proc->super.proc_name.jobid )
            continue;

        /* Find the rank of the peer in MPI_COMM_WORLD */
        for( peer_rank = 0; peer_rank < nprocs_world; peer_rank++ ) {
            wp_name = ompi_group_get_proc_name(((ompi_communicator_t*)&ompi_mpi_comm_world)->c_remote_group,
                                               peer_rank);
            if( 0 != opal_compare_proc( tmp, wp_name) )
                continue;

            /* Store the rank (in MPI_COMM_WORLD) of the process, keyed by its
             * name (a unique opal ID), in the hash table */
            key = *((uint64_t*)&tmp);
            if( OPAL_SUCCESS != opal_hash_table_set_value_uint64(translation_ht, key,
                                                                 (void*)(uintptr_t)peer_rank) ) {
                return OMPI_ERR_OUT_OF_RESOURCE;  /* failed to allocate or grow the hash table */
            }
            break;
        }
    }
    return pml_selected_module.pml_add_procs(procs, nprocs);
}
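/*
 * A minimal sketch (not taken from the monitoring PML itself) of how
 * the translation table built above would be consulted later: pack the
 * peer's opal_process_name_t into the same 64-bit key and ask the hash
 * table for the MPI_COMM_WORLD rank it was stored under. The function
 * name is illustrative; only opal_hash_table_get_value_uint64 is real.
 */
static int example_translate_name_to_world_rank(opal_process_name_t name,
                                                int *world_rank)
{
    uint64_t key = *((uint64_t*)&name);  /* same key construction as above */
    void *value;

    if (OPAL_SUCCESS != opal_hash_table_get_value_uint64(translation_ht,
                                                         key, &value)) {
        return OPAL_ERR_NOT_FOUND;  /* peer is outside MPI_COMM_WORLD */
    }
    *world_rank = (int)(uintptr_t)value;
    return OPAL_SUCCESS;
}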
/*
 * Receive the peer's globally unique process identification from a newly
 * connected socket and verify the expected response. If so, move the
 * socket to a connected state.
 */
static int usock_recv_connect_ack(void)
{
    char *msg;
    char *version;
    int rc;
    char *cred;
    size_t credsize;
    pmix_usock_hdr_t hdr;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s RECV CONNECT ACK FROM SERVER ON SOCKET %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        mca_pmix_native_component.sd);

    /* ensure all is zero'd */
    memset(&hdr, 0, sizeof(pmix_usock_hdr_t));

    if (usock_recv_blocking((char*)&hdr, sizeof(pmix_usock_hdr_t))) {
        /* If the state is CONNECT_ACK, then we were waiting for
         * the connection to be ack'd */
        if (mca_pmix_native_component.state != PMIX_USOCK_CONNECT_ACK) {
            /* handshake broke down - abort this connection */
            opal_output(0, "%s RECV CONNECT BAD HANDSHAKE FROM SERVER ON SOCKET %d",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        mca_pmix_native_component.sd);
            mca_pmix_native_component.state = PMIX_USOCK_FAILED;
            CLOSE_THE_SOCKET(mca_pmix_native_component.sd);
            return OPAL_ERR_UNREACH;
        }
    } else {
        /* unable to complete the recv */
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                            "%s unable to complete recv of connect-ack from server ON SOCKET %d",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                            mca_pmix_native_component.sd);
        return OPAL_ERR_UNREACH;
    }

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s connect-ack recvd from server",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* compare the server's name to the expected value */
    if (0 != opal_compare_proc(hdr.id, mca_pmix_native_component.server)) {
        opal_output(0, "usock_peer_recv_connect_ack: "
                    "%s received unexpected process identifier (%s) from server: expected (%s)",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                    OPAL_NAME_PRINT(hdr.id),
                    OPAL_NAME_PRINT(mca_pmix_native_component.server));
        mca_pmix_native_component.state = PMIX_USOCK_FAILED;
        CLOSE_THE_SOCKET(mca_pmix_native_component.sd);
        return OPAL_ERR_UNREACH;
    }

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s connect-ack header from server is okay",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* get the authentication and version payload */
    if (NULL == (msg = (char*)malloc(hdr.nbytes))) {
        mca_pmix_native_component.state = PMIX_USOCK_FAILED;
        CLOSE_THE_SOCKET(mca_pmix_native_component.sd);
        return OPAL_ERR_OUT_OF_RESOURCE;
    }
    if (!usock_recv_blocking(msg, hdr.nbytes)) {
        /* unable to complete the recv */
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                            "%s unable to complete recv of connect-ack from server ON SOCKET %d",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                            mca_pmix_native_component.sd);
        free(msg);
        return OPAL_ERR_UNREACH;
    }

    /* check that this is from a matching version */
    version = (char*)(msg);
    if (0 != strcmp(version, opal_version_string)) {
        opal_output(0, "usock_peer_recv_connect_ack: "
                    "%s received different version from server: %s instead of %s",
                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                    version, opal_version_string);
        mca_pmix_native_component.state = PMIX_USOCK_FAILED;
        CLOSE_THE_SOCKET(mca_pmix_native_component.sd);
        free(msg);
        return OPAL_ERR_UNREACH;
    }

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s connect-ack version from server matches ours",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* check security token */
    cred = (char*)(msg + strlen(version) + 1);
    credsize = hdr.nbytes - strlen(version) - 1;
    if (OPAL_SUCCESS != (rc = opal_sec.authenticate(cred, credsize, NULL))) {
        OPAL_ERROR_LOG(rc);
        mca_pmix_native_component.state = PMIX_USOCK_FAILED;
        CLOSE_THE_SOCKET(mca_pmix_native_component.sd);
        free(msg);
        return OPAL_ERR_UNREACH;
    }
    free(msg);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s connect-ack from server authenticated",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* connected */
    mca_pmix_native_component.state = PMIX_USOCK_CONNECTED;

    /* initiate send of first message on queue */
    if (NULL == mca_pmix_native_component.send_msg) {
        mca_pmix_native_component.send_msg =
            (pmix_usock_send_t*)opal_list_remove_first(&mca_pmix_native_component.send_queue);
    }
    if (NULL != mca_pmix_native_component.send_msg &&
        !mca_pmix_native_component.send_ev_active) {
        opal_event_add(&mca_pmix_native_component.send_event, 0);
        mca_pmix_native_component.send_ev_active = true;
    }
    if (2 <= opal_output_get_verbosity(opal_pmix_base_framework.framework_output)) {
        pmix_usock_dump("connected");
    }
    return OPAL_SUCCESS;
}
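/*
 * For illustration only: a sketch of how the connect-ack payload parsed
 * above would be laid out by the sender. The header carries the server's
 * process name (hdr.id) and the payload size (hdr.nbytes); the payload is
 * the NUL-terminated version string followed immediately by the raw
 * credential bytes. The function name and buffer handling here are
 * assumptions, not the actual PMIx server code.
 */
static char *example_pack_connect_ack(const char *version_str,
                                      const char *cred, size_t credsize,
                                      size_t *nbytes)
{
    size_t vlen = strlen(version_str) + 1;   /* include the NUL separator */
    char *payload = (char*)malloc(vlen + credsize);

    if (NULL == payload) {
        return NULL;
    }
    memcpy(payload, version_str, vlen);      /* version first, NUL-terminated */
    memcpy(payload + vlen, cred, credsize);  /* credential bytes follow */
    *nbytes = vlen + credsize;               /* becomes hdr.nbytes on the wire */
    return payload;
}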
/*
 * Invoked when a new communicator has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return.
 */
mca_scoll_base_module_t *
mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority)
{
    mca_scoll_base_module_t *module;
    mca_scoll_mpi_module_t *mpi_module;
    mca_scoll_mpi_component_t *cm = &mca_scoll_mpi_component;
    ompi_group_t *parent_group, *new_group;
    ompi_communicator_t *newcomm = NULL;
    int err, i;
    int tag;
    int *ranks;

    *priority = 0;
    if (!cm->mpi_enable) {
        return NULL;
    }
    if ((osh_group->proc_count < 2) || (osh_group->proc_count < cm->mpi_np)) {
        return NULL;
    }

    /* Create an ompi_communicator_t object and store a pointer to it in the group object */
    if (NULL == oshmem_group_all) {
        osh_group->ompi_comm = &(ompi_mpi_comm_world.comm);
    } else {
        int my_rank = MPI_UNDEFINED;

        err = ompi_comm_group(&(ompi_mpi_comm_world.comm), &parent_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            return NULL;
        }
        ranks = (int*) malloc(osh_group->proc_count * sizeof(int));
        if (OPAL_UNLIKELY(NULL == ranks)) {
            return NULL;
        }
        tag = 1;

        for (i = 0; i < osh_group->proc_count; i++) {
            ompi_proc_t *ompi_proc;
            for (int j = 0; j < ompi_group_size(parent_group); j++) {
                ompi_proc = ompi_group_peer_lookup(parent_group, j);
                if (0 == opal_compare_proc(ompi_proc->super.proc_name,
                                           osh_group->proc_array[i]->super.proc_name)) {
                    ranks[i] = j;
                    break;
                }
            }

            /* NTH: keep track of my rank in the new group for the workaround below */
            if (ranks[i] == ompi_comm_rank(&ompi_mpi_comm_world.comm)) {
                my_rank = i;
            }
        }

        err = ompi_group_incl(parent_group, osh_group->proc_count, ranks, &new_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }

        /* NTH: XXX -- WORKAROUND -- The oshmem code overwrites ompi_proc_local_proc with its
         * own proc but does not update the proc list in comm world or comm self. This causes
         * the code in ompi_group_incl that updates grp_my_rank to fail. This will cause failures
         * here and when an application attempts to mix oshmem and mpi, so it will really need to
         * be fixed in oshmem/proc and not here. For now we work around a new jenkins failure
         * by setting my group rank so we do not crash when running ompi_comm_create_group. */
        new_group->grp_my_rank = my_rank;

        err = ompi_comm_create_group(&(ompi_mpi_comm_world.comm), new_group, tag, &newcomm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }
        err = ompi_group_free(&new_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }

        free(ranks);
        osh_group->ompi_comm = newcomm;
    }

    mpi_module = OBJ_NEW(mca_scoll_mpi_module_t);
    if (!mpi_module) {
        return NULL;
    }
    mpi_module->comm = osh_group->ompi_comm;
    mpi_module->super.scoll_module_enable = mca_scoll_mpi_module_enable;
    mpi_module->super.scoll_barrier = mca_scoll_mpi_barrier;
    mpi_module->super.scoll_broadcast = mca_scoll_mpi_broadcast;
    mpi_module->super.scoll_reduce = mca_scoll_mpi_reduce;
    mpi_module->super.scoll_collect = mca_scoll_mpi_collect;

    *priority = cm->mpi_priority;
    module = &mpi_module->super;
    return module;
}
/*
 * Invoked when a new communicator has been created.
 * Look at the communicator and decide which set of functions and
 * priority we want to return.
 */
mca_scoll_base_module_t *
mca_scoll_mpi_comm_query(oshmem_group_t *osh_group, int *priority)
{
    mca_scoll_base_module_t *module;
    mca_scoll_mpi_module_t *mpi_module;
    mca_scoll_mpi_component_t *cm = &mca_scoll_mpi_component;
    ompi_group_t *parent_group, *new_group;
    ompi_communicator_t *newcomm = NULL;
    int err, i;
    int tag;
    int *ranks;

    *priority = 0;
    if (!cm->mpi_enable) {
        return NULL;
    }
    if ((osh_group->proc_count < 2) || (osh_group->proc_count < cm->mpi_np)) {
        return NULL;
    }

    /* Create an ompi_communicator_t object and store a pointer to it in the group object */
    if (NULL == oshmem_group_all) {
        osh_group->ompi_comm = &(ompi_mpi_comm_world.comm);
    } else {
        err = ompi_comm_group(&(ompi_mpi_comm_world.comm), &parent_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            return NULL;
        }
        ranks = (int*) malloc(osh_group->proc_count * sizeof(int));
        if (OPAL_UNLIKELY(NULL == ranks)) {
            return NULL;
        }
        tag = 1;

        for (i = 0; i < osh_group->proc_count; i++) {
            ompi_proc_t *ompi_proc;
            for (int j = 0; j < ompi_group_size(parent_group); j++) {
                ompi_proc = ompi_group_peer_lookup(parent_group, j);
                if (0 == opal_compare_proc(ompi_proc->super.proc_name,
                                           osh_group->proc_array[i]->super.proc_name)) {
                    ranks[i] = j;
                    break;
                }
            }
        }

        err = ompi_group_incl(parent_group, osh_group->proc_count, ranks, &new_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }
        err = ompi_comm_create_group(&(ompi_mpi_comm_world.comm), new_group, tag, &newcomm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }
        err = ompi_group_free(&new_group);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != err)) {
            free(ranks);
            return NULL;
        }

        free(ranks);
        osh_group->ompi_comm = newcomm;
    }

    mpi_module = OBJ_NEW(mca_scoll_mpi_module_t);
    if (!mpi_module) {
        return NULL;
    }
    mpi_module->comm = osh_group->ompi_comm;
    mpi_module->super.scoll_module_enable = mca_scoll_mpi_module_enable;
    mpi_module->super.scoll_barrier = mca_scoll_mpi_barrier;
    mpi_module->super.scoll_broadcast = mca_scoll_mpi_broadcast;
    mpi_module->super.scoll_reduce = mca_scoll_mpi_reduce;
    mpi_module->super.scoll_collect = mca_scoll_mpi_collect;
    mpi_module->super.scoll_alltoall = NULL;

    *priority = cm->mpi_priority;
    module = &mpi_module->super;
    return module;
}
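/*
 * A minimal sketch, with hypothetical names, of how a selection loop in
 * the scoll framework might consume comm_query above: ask each component
 * for a module and a priority, keep the highest bidder, then enable the
 * winning module for the group. This is illustrative pseudocode for the
 * standard MCA query/enable pattern, not the actual OSHMEM selection
 * code; example_components and example_component_count are assumptions.
 */
static mca_scoll_base_module_t *
example_select_scoll_module(oshmem_group_t *osh_group)
{
    mca_scoll_base_module_t *best_module = NULL;
    int best_priority = -1;

    for (int c = 0; c < example_component_count; c++) {
        int priority = 0;
        mca_scoll_base_module_t *candidate =
            example_components[c]->scoll_comm_query(osh_group, &priority);
        if (NULL != candidate && priority > best_priority) {
            best_priority = priority;
            best_module = candidate;
        }
    }
    if (NULL != best_module) {
        /* give the winning module a chance to finish its setup */
        best_module->scoll_module_enable(best_module, osh_group);
    }
    return best_module;
}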