/* self communication optimizations */ static inline int ompi_osc_rdma_put_self (void *source, int source_count, ompi_datatype_t *source_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ if (module->sc_group && !module->active_eager_send_active) { OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_post_msgs) { opal_condition_wait(&module->cond, &module->lock); } OPAL_THREAD_UNLOCK(&module->lock); } if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } if (request) { ompi_osc_rdma_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; }
static inline int ompi_osc_rdma_cas_self (void *source, void *compare, void *result, ompi_datatype_t *datatype, OPAL_PTRDIFF_TYPE target_disp, ompi_osc_rdma_module_t *module) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); /* if we are in active target mode wait until all post messages arrive */ if (module->sc_group && !module->active_eager_send_active) { OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_post_msgs) { opal_condition_wait(&module->cond, &module->lock); } OPAL_THREAD_UNLOCK(&module->lock); } if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } ompi_osc_rdma_accumulate_lock (module); memcpy (result, target, datatype->super.size); if (0 == memcmp (compare, target, datatype->super.size)) { memcpy (target, source, datatype->super.size); } ompi_osc_rdma_accumulate_unlock (module); return OMPI_SUCCESS; }
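/* A minimal sketch (not taken from any Open MPI source file; the example_* names are hypothetical and the usual opal/threads/mutex.h and opal/threads/condition.h headers are assumed) of the condition-variable idiom the functions collected here all follow: take the lock, re-test the predicate in a loop around opal_condition_wait(), and flip the predicate under the same lock before signaling. */
static opal_mutex_t example_lock;        /* assumed to be OBJ_CONSTRUCTed during init */
static opal_condition_t example_cond;    /* assumed to be OBJ_CONSTRUCTed during init */
static volatile int example_pending = 1; /* the predicate protected by example_lock */

static void example_wait_for_completion(void)
{
    OPAL_THREAD_LOCK(&example_lock);
    /* always re-check the predicate: the wait can return without it being true */
    while (0 != example_pending) {
        opal_condition_wait(&example_cond, &example_lock);
    }
    OPAL_THREAD_UNLOCK(&example_lock);
}

static void example_signal_completion(void)
{
    OPAL_THREAD_LOCK(&example_lock);
    example_pending = 0;
    opal_condition_signal(&example_cond);
    OPAL_THREAD_UNLOCK(&example_lock);
}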
/* * User-level call to detach buffer */ int mca_pml_base_bsend_detach(void* addr, int* size) { OPAL_THREAD_LOCK(&mca_pml_bsend_mutex); /* is buffer attached */ if(NULL == mca_pml_bsend_allocator) { OPAL_THREAD_UNLOCK(&mca_pml_bsend_mutex); return OMPI_ERR_BUFFER; } /* wait on any pending requests */ while(mca_pml_bsend_count != 0) opal_condition_wait(&mca_pml_bsend_condition, &mca_pml_bsend_mutex); /* free resources associated with the allocator */ mca_pml_bsend_allocator->alc_finalize(mca_pml_bsend_allocator); mca_pml_bsend_allocator = NULL; /* return current settings */ if(NULL != addr) *((void**)addr) = mca_pml_bsend_userbase; if(NULL != size) *size = (int)mca_pml_bsend_usersize; /* reset local variables */ mca_pml_bsend_userbase = NULL; mca_pml_bsend_usersize = 0; mca_pml_bsend_base = NULL; mca_pml_bsend_addr = NULL; mca_pml_bsend_size = 0; mca_pml_bsend_count = 0; OPAL_THREAD_UNLOCK(&mca_pml_bsend_mutex); return OMPI_SUCCESS; }
static inline int ompi_osc_rdma_gacc_self (void *source, int source_count, ompi_datatype_t *source_datatype, void *result, int result_count, ompi_datatype_t *result_datatype, OPAL_PTRDIFF_TYPE target_disp, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { void *target = (unsigned char*) module->baseptr + ((unsigned long) target_disp * module->disp_unit); int ret; /* if we are in active target mode wait until all post messages arrive */ if (module->sc_group && !module->active_eager_send_active) { OPAL_THREAD_LOCK(&module->lock); while (0 != module->num_post_msgs) { opal_condition_wait(&module->cond, &module->lock); } OPAL_THREAD_UNLOCK(&module->lock); } if (!(module->passive_target_access_epoch || module->active_eager_send_active)) { return OMPI_ERR_RMA_SYNC; } ompi_osc_rdma_accumulate_lock (module); do { ret = ompi_datatype_sndrcv (target, target_count, target_datatype, result, result_count, result_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed copying to the target buffer. ret = %d", ret)); break; } if (&ompi_mpi_op_no_op.op != op) { if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source, source_count, source_datatype, target, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv (source, source_count, source_datatype, target, target_count, target_datatype); } } if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((5, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret)); break; } } while (0); ompi_osc_rdma_accumulate_unlock (module); if (request) { /* NTH: is it ok to use an ompi error code here? */ ompi_osc_rdma_request_complete (request, ret); } return OMPI_SUCCESS; }
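/* Fetch the memory keys for segment 'seg' on remote PE 'pe': try the local SPML first; otherwise send a MEMHEAP_RKEY_REQ over RML and block on memheap_oob.cond until the reply handler sets mkeys_rcvd. */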
static int memheap_oob_get_mkeys(int pe, uint32_t seg, sshmem_mkey_t *mkeys) { opal_buffer_t *msg; uint8_t cmd; int i; int rc; if (OSHMEM_SUCCESS == MCA_SPML_CALL(oob_get_mkeys(pe, seg, mkeys))) { for (i = 0; i < memheap_map->num_transports; i++) { mkeys[i].va_base = mca_memheap_seg2base_va(seg); MEMHEAP_VERBOSE(5, "MKEY CALCULATED BY LOCAL SPML: pe: %d tr_id: %d %s", pe, i, mca_spml_base_mkey2str(&mkeys[i])); } return OSHMEM_SUCCESS; } OPAL_THREAD_LOCK(&memheap_oob.lck); memheap_oob.mkeys = mkeys; memheap_oob.mkeys_rcvd = 0; msg = OBJ_NEW(opal_buffer_t); if (!msg) { OPAL_THREAD_UNLOCK(&memheap_oob.lck); MEMHEAP_ERROR("failed to get msg buffer"); return OSHMEM_ERROR; } /* the lock is already held from above; do not re-acquire it here */ cmd = MEMHEAP_RKEY_REQ; opal_dss.pack(msg, &cmd, 1, OPAL_UINT8); opal_dss.pack(msg, &seg, 1, OPAL_UINT32); rc = send_buffer(pe, msg); if (MPI_SUCCESS != rc) { OPAL_THREAD_UNLOCK(&memheap_oob.lck); MEMHEAP_ERROR("FAILED to send rml message %d", rc); return OSHMEM_ERROR; } while (!memheap_oob.mkeys_rcvd) { opal_condition_wait(&memheap_oob.cond, &memheap_oob.lck); } if (MEMHEAP_RKEY_RESP == memheap_oob.mkeys_rcvd) { rc = OSHMEM_SUCCESS; } else { MEMHEAP_ERROR("failed to get rkey seg#%d pe=%d", seg, pe); rc = OSHMEM_ERROR; } OPAL_THREAD_UNLOCK(&memheap_oob.lck); return rc; }
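/* Second thread of the condition-variable ping-pong test: signals thr1_cond and waits on thr2_cond for TEST_COUNT iterations, then reports the average time per iteration. */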
static void* thr2_run(opal_object_t* obj) { int i; clock_t c1, c2; opal_mutex_lock(&mutex); c1 = clock(); for(i=0; i<TEST_COUNT; i++) { opal_condition_signal(&thr1_cond); opal_condition_wait(&thr2_cond, &mutex); thr2_count++; } c2 = clock(); opal_mutex_unlock(&mutex); fprintf(stderr, "thr2: time per iteration: %ld usec\n", (long)((c2 - c1) / TEST_COUNT)); return NULL; }
int orte_rml_oob_recv_buffer(orte_process_name_t* peer, opal_buffer_t *buf, orte_rml_tag_t tag, int flags) { orte_rml_oob_msg_t *msg = OBJ_NEW(orte_rml_oob_msg_t); int ret; msg->msg_type = ORTE_RML_BLOCKING_RECV; flags |= (ORTE_RML_FLAG_RECURSIVE_CALLBACK | ORTE_RML_ALLOC); msg->msg_data = (struct iovec *) malloc(sizeof(struct iovec) * 2); msg->msg_data[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_header; msg->msg_data[0].iov_len = sizeof(orte_rml_oob_msg_header_t); msg->msg_data[1].iov_base = NULL; msg->msg_data[1].iov_len = 0; ret = orte_rml_oob_module.active_oob->oob_recv_nb(peer, msg->msg_data, 2, tag, flags, orte_rml_recv_msg_callback, msg); if (ret < 0) goto cleanup; OPAL_THREAD_LOCK(&msg->msg_lock); while (!msg->msg_complete) { opal_condition_wait(&msg->msg_cond, &msg->msg_lock); } ret = msg->msg_status; OPAL_THREAD_UNLOCK(&msg->msg_lock); if (ret > 0) { ret = opal_dss.load(buf, msg->msg_data[1].iov_base, msg->msg_data[1].iov_len); } cleanup: OBJ_RELEASE(msg); return ret; }
int orte_rml_oob_recv(orte_process_name_t* peer, struct iovec *iov, int count, orte_rml_tag_t tag, int flags) { orte_rml_oob_msg_t *msg = OBJ_NEW(orte_rml_oob_msg_t); int ret; int i; msg->msg_type = ORTE_RML_BLOCKING_RECV; flags |= ORTE_RML_FLAG_RECURSIVE_CALLBACK; msg->msg_data = (struct iovec *) malloc(sizeof(struct iovec) * (count + 1)); msg->msg_data[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_header; msg->msg_data[0].iov_len = sizeof(orte_rml_oob_msg_header_t); for (i = 0 ; i < count ; ++i) { msg->msg_data[i + 1].iov_base = iov[i].iov_base; msg->msg_data[i + 1].iov_len = iov[i].iov_len; } ret = orte_rml_oob_module.active_oob->oob_recv_nb(peer, msg->msg_data, count + 1, tag, flags, orte_rml_recv_msg_callback, msg); if (ret < 0) goto cleanup; OPAL_THREAD_LOCK(&msg->msg_lock); while (!msg->msg_complete) { opal_condition_wait(&msg->msg_cond, &msg->msg_lock); } ret = msg->msg_status; OPAL_THREAD_UNLOCK(&msg->msg_lock); cleanup: OBJ_RELEASE(msg); if (ret > 0) { ret -= sizeof(struct orte_rml_oob_msg_header_t); } return ret; }
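/* Block until a posted UD OOB message leaves the POSTED state, then map its final status to an ORTE return code. */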
int mca_oob_ud_msg_wait (mca_oob_ud_msg_t *msg) { OPAL_THREAD_LOCK(&msg->lock); /* wait for ack */ while (MCA_OOB_UD_MSG_STATUS_POSTED == msg->status) { opal_condition_wait (&msg->status_changed, &msg->lock); } OPAL_THREAD_UNLOCK(&msg->lock); switch (msg->status) { case MCA_OOB_UD_MSG_STATUS_TIMEOUT: return ORTE_ERR_TIMEOUT; case MCA_OOB_UD_MSG_STATUS_COMPLETE: return ORTE_SUCCESS; case MCA_OOB_UD_MSG_STATUS_ERROR: default: return ORTE_ERROR; } }
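/* MPI_Win_wait for the pt2pt one-sided component: wait until all incoming operations and completion messages for the exposure epoch have been processed, then drop the exposure/posted modes and release the post group. */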
int ompi_osc_pt2pt_module_wait(ompi_win_t *win) { ompi_group_t *group; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); OPAL_THREAD_LOCK(&module->p2p_lock); while (0 != (module->p2p_num_pending_in) || 0 != (module->p2p_num_complete_msgs)) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } group = module->p2p_pw_group; module->p2p_pw_group = NULL; OPAL_THREAD_UNLOCK(&module->p2p_lock); ompi_win_remove_mode(win, OMPI_WIN_EXPOSE_EPOCH | OMPI_WIN_POSTED); ompi_group_decrement_proc_count(group); OBJ_RELEASE(group); return OMPI_SUCCESS; }
int mca_oob_tcp_recv_cancel( orte_process_name_t* name, int tag) { int matched = 0; opal_list_item_t *item, *next; /* wait for any previously matched messages to be processed */ OPAL_THREAD_LOCK(&mca_oob_tcp_component.tcp_match_lock); #if OMPI_ENABLE_PROGRESS_THREADS if(opal_event_progress_thread() == false) { while(mca_oob_tcp_component.tcp_match_count) { opal_condition_wait( &mca_oob_tcp_component.tcp_match_cond, &mca_oob_tcp_component.tcp_match_lock); } } #endif /* remove any matching posted receives */ for(item = opal_list_get_first(&mca_oob_tcp_component.tcp_msg_post); item != opal_list_get_end(&mca_oob_tcp_component.tcp_msg_post); item = next) { mca_oob_tcp_msg_t* msg = (mca_oob_tcp_msg_t*)item; next = opal_list_get_next(item); if (OPAL_EQUAL == opal_dss.compare(name, &msg->msg_peer, ORTE_NAME)) { if (msg->msg_hdr.msg_tag == tag) { opal_list_remove_item(&mca_oob_tcp_component.tcp_msg_post, &msg->super.super); MCA_OOB_TCP_MSG_RETURN(msg); matched++; } } } OPAL_THREAD_UNLOCK(&mca_oob_tcp_component.tcp_match_lock); return (matched > 0) ? ORTE_SUCCESS : ORTE_ERR_NOT_FOUND; }
static int orte_pls_rsh_launch_threaded(orte_jobid_t jobid) { struct timeval tv = { 0, 0 }; struct opal_event event; struct orte_pls_rsh_stack_t stack; OBJ_CONSTRUCT(&stack, orte_pls_rsh_stack_t); stack.jobid = jobid; if( opal_event_progress_thread() ) { stack.rc = orte_pls_rsh_launch( jobid ); } else { opal_evtimer_set(&event, orte_pls_rsh_launch_cb, &stack); opal_evtimer_add(&event, &tv); OPAL_THREAD_LOCK(&stack.mutex); while (stack.complete == false) { opal_condition_wait(&stack.cond, &stack.mutex); } OPAL_THREAD_UNLOCK(&stack.mutex); } OBJ_DESTRUCT(&stack); return stack.rc; }
/***************************** * Local Function Definitions *****************************/ static int blcr_checkpoint_peer(pid_t pid, char * local_dir, char ** fname) { char **cr_argv = NULL; char *cr_cmd = NULL; int ret; pid_t child_pid; int exit_status = OPAL_SUCCESS; int status, child_status; opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: checkpoint_peer(%d, --)", pid); /* * Get the checkpoint command */ if ( OPAL_SUCCESS != (ret = opal_crs_blcr_checkpoint_cmd(pid, local_dir, fname, &cr_cmd)) ) { opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: checkpoint_peer: Failed to generate checkpoint command :(%d):", ret); exit_status = ret; goto cleanup; } if ( NULL == (cr_argv = opal_argv_split(cr_cmd, ' ')) ) { opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: checkpoint_peer: Failed to opal_argv_split :(%d):", ret); exit_status = OPAL_ERROR; goto cleanup; } /* * Fork a child to do the checkpoint */ blcr_current_state = OPAL_CRS_CHECKPOINT; child_pid = fork(); if(0 == child_pid) { /* Child Process */ opal_output_verbose(10, mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: exec :(%s, %s):", strdup(blcr_checkpoint_cmd), opal_argv_join(cr_argv, ' ')); status = execvp(strdup(blcr_checkpoint_cmd), cr_argv); if(status < 0) { opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: Child failed to execute :(%d):", status); } opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: execvp returned %d", status); /* exec failed - the child must not fall through and return into the parent's code path */ exit(1); } else if(child_pid > 0) { /* Don't waitpid here since we don't really want to restart from inside waitpid ;) */ while(OPAL_CRS_RESTART != blcr_current_state && OPAL_CRS_CONTINUE != blcr_current_state ) { OPAL_THREAD_LOCK(&blcr_lock); opal_condition_wait(&blcr_cond, &blcr_lock); OPAL_THREAD_UNLOCK(&blcr_lock); } opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: Thread finished with status %d", blcr_current_state); if(OPAL_CRS_CONTINUE == blcr_current_state) { /* Wait for the child only if we are continuing */ if( 0 > waitpid(child_pid, &child_status, 0) ) { opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: waitpid returned %d", child_status); } } } else { opal_output(mca_crs_blcr_component.super.output_handle, "crs:blcr: blcr_checkpoint_peer: fork failed :(%d):", child_pid); } /* * Cleanup */ cleanup: if(NULL != cr_cmd) free(cr_cmd); if(NULL != cr_argv) opal_argv_free(cr_argv); return exit_status; }
static int hier_allgather(opal_buffer_t *sbuf, opal_buffer_t *rbuf) { int rc=ORTE_SUCCESS; opal_list_item_t *item; orte_namelist_t *nm; opal_buffer_t tmp_buf; OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:hier entering allgather", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* have I initialized my local info? */ if (!coll_initialized) { orte_process_name_t proc; orte_vpid_t v; /* get my local rank so I can locally cache it */ my_local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME); /* if I am local_rank=0 for this node and job, then setup * my array of local_rank=0 peers */ if (0 == my_local_rank) { /* we need one entry/node in this job */ my_coll_peers = (orte_vpid_t*)malloc(orte_process_info.num_nodes * sizeof(orte_vpid_t)); cpeers = 0; } /* cycle through the procs to create a list of those that are local to me */ proc.jobid = ORTE_PROC_MY_NAME->jobid; for (v=0; v < orte_process_info.num_procs; v++) { proc.vpid = v; ORTE_EPOCH_SET(proc.epoch,orte_util_lookup_epoch(&proc)); /* is this proc local_rank=0 on its node? */ if (0 == my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { my_coll_peers[cpeers++] = v; } /* if this is me, or this proc isn't on our node, ignore it */ if (v == ORTE_PROC_MY_NAME->vpid || !OPAL_PROC_ON_LOCAL_NODE(orte_ess.proc_get_locality(&proc))) { continue; } /* add this proc to our list of local peers */ nm = OBJ_NEW(orte_namelist_t); nm->name.jobid = proc.jobid; nm->name.vpid = proc.vpid; ORTE_EPOCH_SET(nm->name.epoch,proc.epoch); opal_list_append(&my_local_peers, &nm->item); /* if I am not local_rank=0, is this one? */ if (0 != my_local_rank && 0 == orte_ess.get_local_rank(&proc)) { my_local_rank_zero_proc.jobid = proc.jobid; my_local_rank_zero_proc.vpid = proc.vpid; ORTE_EPOCH_SET(my_local_rank_zero_proc.epoch,proc.epoch); } } /* compute the number of local peers - note that this number * does not include me!! */ num_local_peers = opal_list_get_size(&my_local_peers); /* flag that I have initialized things */ coll_initialized = true; } /* if I am not local rank = 0 */ if (0 != my_local_rank) { if (ORTE_VPID_INVALID == my_local_rank_zero_proc.vpid) { /* something is broken */ ORTE_ERROR_LOG(ORTE_ERR_FATAL); return ORTE_ERR_FATAL; } /* setup the collective */ OPAL_THREAD_LOCK(&allgather.lock); allgather.recvd = 0; /* reset the collector */ OBJ_DESTRUCT(&allgather.results); OBJ_CONSTRUCT(&allgather.results, opal_buffer_t); OPAL_THREAD_UNLOCK(&allgather.lock); /* send our data to the local_rank=0 proc on this node */ if (0 > (rc = orte_rml.send_buffer(&my_local_rank_zero_proc, sbuf, ORTE_RML_TAG_ALLGATHER, 0))) { ORTE_ERROR_LOG(rc); return rc; } /* now receive the final result. Be sure to do this in * a manner that allows us to return without being in a recv! */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, ORTE_RML_NON_PERSISTENT, allgather_recv, &allgather); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } /* wait to complete - we will receive a single message * sent from our local_rank=0 peer */ OPAL_THREAD_LOCK(&allgather.lock); while (allgather.recvd < 1) { opal_condition_wait(&allgather.cond, &allgather.lock); } /* copy payload to the caller's buffer */ if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(rbuf, &allgather.results))) { ORTE_ERROR_LOG(rc); } OPAL_THREAD_UNLOCK(&allgather.lock); } else { /* I am local_rank = 0 on this node! 
*/ /* setup the collective */ OPAL_THREAD_LOCK(&allgather.lock); allgather.recvd = 0; /* reset the collector */ OBJ_DESTRUCT(&allgather.results); OBJ_CONSTRUCT(&allgather.results, opal_buffer_t); /* seed with my data */ opal_dss.copy_payload(&allgather.results, sbuf); OPAL_THREAD_UNLOCK(&allgather.lock); /* wait to receive their data. Be sure to do this in * a manner that allows us to return without being in a recv! */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER, ORTE_RML_PERSISTENT, allgather_recv, &allgather); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } /* wait to complete - we need to receive input from every * local peer (excluding myself) */ OPAL_THREAD_LOCK(&allgather.lock); while (allgather.recvd < num_local_peers) { opal_condition_wait(&allgather.cond, &allgather.lock); } /* xfer to the tmp buf in case another allgather comes along */ OBJ_CONSTRUCT(&tmp_buf, opal_buffer_t); opal_dss.copy_payload(&tmp_buf, &allgather.results); OPAL_THREAD_UNLOCK(&allgather.lock); /* cancel the lingering recv */ orte_rml.recv_cancel(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ALLGATHER); /* take the recv'd data and use one of the base collectives * to exchange it with all other local_rank=0 procs in a scalable * manner - the exact collective will depend upon the number of * nodes in the job */ if (ORTE_SUCCESS != (rc = orte_grpcomm_base_allgather(&tmp_buf, rbuf, num_local_peers + 1, ORTE_PROC_MY_NAME->jobid, cpeers, my_coll_peers))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&tmp_buf); return rc; } OBJ_DESTRUCT(&tmp_buf); /* done with this */ /* distribute the results to our local peers */ for (item = opal_list_get_first(&my_local_peers); item != opal_list_get_end(&my_local_peers); item = opal_list_get_next(item)) { nm = (orte_namelist_t*)item; if (0 > (rc = orte_rml.send_buffer(&nm->name, rbuf, ORTE_RML_TAG_ALLGATHER, 0))) { ORTE_ERROR_LOG(rc); return rc; } } } OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base.output, "%s grpcomm:hier allgather completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; }
static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_win_t *win, bool release_req, struct ompi_request_t **request) { int ret, tag; ompi_osc_rdma_module_t *module = GET_MODULE(win); bool is_long_datatype = false; ompi_osc_rdma_frag_t *frag; ompi_osc_rdma_header_get_t *header; size_t ddt_len, frag_len; char *ptr; const void *packed_ddt; ompi_osc_rdma_request_t *rdma_request; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, win->w_name)); if (!ompi_osc_rdma_check_access_epoch (module, target)) { return OMPI_ERR_RMA_SYNC; } /* gets are always request based, so that we know where to land the data */ OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request); if (NULL == rdma_request) { return OMPI_ERR_OUT_OF_RESOURCE; } rdma_request->internal = release_req; /* short-circuit case */ if (0 == origin_count || 0 == target_count) { ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS); *request = &rdma_request->super; return OMPI_SUCCESS; } /* optimize self communication. TODO: optimize local communication */ if (ompi_comm_rank (module->comm) == target) { *request = &rdma_request->super; return ompi_osc_rdma_get_self (origin_addr, origin_count, origin_dt, target_disp, target_count, target_dt, module, rdma_request); } rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET; rdma_request->origin_addr = origin_addr; rdma_request->origin_count = origin_count; OBJ_RETAIN(origin_dt); rdma_request->origin_dt = origin_dt; /* Compute datatype length. Note that the datatype description * must fit in a single frag */ ddt_len = ompi_datatype_pack_description_length(target_dt); OPAL_THREAD_LOCK(&module->lock); frag_len = sizeof(ompi_osc_rdma_header_get_t) + ddt_len; ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); if (OMPI_SUCCESS != ret) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(ompi_osc_rdma_header_put_t) + 8; ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } tag = get_tag (module); /* for bookkeeping the get is "outgoing" */ ompi_osc_signal_outgoing (module, target, 1); /* flush will be called at the end of this function. make sure the post message has * arrived. */ if (!release_req && module->sc_group) { while (0 != module->num_post_msgs) { OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "waiting for post messages. num_post_msgs = %d", module->num_post_msgs)); opal_condition_wait(&module->cond, &module->lock); } } OPAL_THREAD_UNLOCK(&module->lock); header = (ompi_osc_rdma_header_get_t*) ptr; header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; header->tag = tag; ptr += sizeof(ompi_osc_rdma_header_get_t); do { ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. 
send it separately */ header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE; OBJ_RETAIN(target_dt); ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target, tag, module->comm, ompi_osc_rdma_dt_send_complete, target_dt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } /* TODO -- store the request somewhere so we can cancel it on error */ rdma_request->outstanding_requests = 1; ret = ompi_osc_rdma_irecv_w_cb (origin_addr, origin_count, origin_dt, target, tag, module->comm, NULL, ompi_osc_rdma_req_comm_complete, rdma_request); } while (0); if (OMPI_SUCCESS == ret) { header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID; *request = &rdma_request->super; } OPAL_THREAD_LOCK(&module->lock); ret = ompi_osc_rdma_frag_finish(module, frag); if (!release_req) { /* need to flush now in case the caller decides to wait on the request */ ompi_osc_rdma_frag_flush_target (module, target); } OPAL_THREAD_UNLOCK(&module->lock); return ret; }
int ompi_osc_pt2pt_module_fence(int assert, ompi_win_t *win) { unsigned int incoming_reqs; int ret = OMPI_SUCCESS, i; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); int num_outgoing = 0; if (0 != (assert & MPI_MODE_NOPRECEDE)) { /* check that the user didn't lie to us - since NOPRECEDED must be specified by all processes if it is specified by any process, if we see this it is safe to assume that there are no pending operations anywhere needed to close out this epoch. No need to lock, since it's a lookup and any pending modification of the pending_sendreqs during this time is an erroneous program. */ if (0 != opal_list_get_size(&(module->p2p_pending_sendreqs))) { return MPI_ERR_RMA_SYNC; } } else { opal_list_item_t *item; /* "atomically" copy all the data we're going to be modifying into the copy... */ OPAL_THREAD_LOCK(&(module->p2p_lock)); ompi_osc_pt2pt_flip_sendreqs(module); OPAL_THREAD_UNLOCK(&(module->p2p_lock)); num_outgoing = opal_list_get_size(&(module->p2p_copy_pending_sendreqs)); /* find out how much data everyone is going to send us. */ ret = module->p2p_comm-> c_coll.coll_reduce_scatter(module->p2p_copy_num_pending_sendreqs, &incoming_reqs, module->p2p_fence_coll_counts, MPI_UNSIGNED, MPI_SUM, module->p2p_comm, module->p2p_comm->c_coll.coll_reduce_scatter_module); if (OMPI_SUCCESS != ret) { /* put the stupid data back for the user. This is not cheap, but the user lost his data if we don't. */ OPAL_THREAD_LOCK(&(module->p2p_lock)); opal_list_join(&module->p2p_pending_sendreqs, opal_list_get_end(&module->p2p_pending_sendreqs), &module->p2p_copy_pending_sendreqs); for (i = 0 ; i < ompi_comm_size(module->p2p_comm) ; ++i) { module->p2p_num_pending_sendreqs[i] += module->p2p_copy_num_pending_sendreqs[i]; } OPAL_THREAD_UNLOCK(&(module->p2p_lock)); return ret; } OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "fence: waiting on %d in and %d out", module->p2p_num_pending_in, module->p2p_num_pending_out)); /* try to start all the requests. We've copied everything we need out of pending_sendreqs, so don't need the lock here */ while (NULL != (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { ompi_osc_pt2pt_sendreq_t *req = (ompi_osc_pt2pt_sendreq_t*) item; ret = ompi_osc_pt2pt_sendreq_send(module, req); if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret ) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); opal_list_append(&(module->p2p_copy_pending_sendreqs), item); } else if (OMPI_SUCCESS != ret) { return ret; } } OPAL_THREAD_LOCK(&module->p2p_lock); /* possible we've already received a couple in messages, so add however many we're going to wait for */ module->p2p_num_pending_in += incoming_reqs; module->p2p_num_pending_out += num_outgoing; /* now we know how many things we're waiting for - wait for them... */ while (module->p2p_num_pending_in > 0 || 0 != module->p2p_num_pending_out) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } OPAL_THREAD_UNLOCK(&module->p2p_lock); } /* all transfers are done - back to the real world we go */ if (0 == (assert & MPI_MODE_NOSUCCEED)) { ompi_win_set_mode(win, OMPI_WIN_FENCE); } else { ompi_win_set_mode(win, 0); } return OMPI_SUCCESS; }
static int ompi_osc_pt2pt_accumulate_w_req (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt, int target, OPAL_PTRDIFF_TYPE target_disp, int target_count, struct ompi_datatype_t *target_dt, struct ompi_op_t *op, ompi_win_t *win, ompi_osc_pt2pt_request_t *request) { int ret; ompi_osc_pt2pt_module_t *module = GET_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target); bool is_long_datatype = false; bool is_long_msg = false; ompi_osc_pt2pt_frag_t *frag; ompi_osc_pt2pt_header_acc_t *header; ompi_osc_pt2pt_sync_t *pt2pt_sync; size_t ddt_len, payload_len, frag_len; char *ptr; const void *packed_ddt; int tag = -1; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "acc: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s", (unsigned long) origin_addr, origin_count, origin_dt->name, target, (int) target_disp, target_count, target_dt->name, op->o_name, win->w_name)); pt2pt_sync = ompi_osc_pt2pt_module_sync_lookup (module, target, NULL); if (OPAL_UNLIKELY(NULL == pt2pt_sync)) { return OMPI_ERR_RMA_SYNC; } /* short-circuit case */ if (0 == origin_count || 0 == target_count) { if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } return OMPI_SUCCESS; } /* optimize the self case. TODO: optimize the local case */ if (ompi_comm_rank (module->comm) == target) { return ompi_osc_pt2pt_acc_self (pt2pt_sync, origin_addr, origin_count, origin_dt, target_disp, target_count, target_dt, op, module, request); } /* Compute datatype and payload lengths. Note that the datatype description * must fit in a single frag */ ddt_len = ompi_datatype_pack_description_length(target_dt); payload_len = origin_dt->super.size * origin_count; frag_len = sizeof(*header) + ddt_len + payload_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, false, true); if (OMPI_SUCCESS != ret) { frag_len = sizeof(*header) + ddt_len; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, true, !request); if (OMPI_SUCCESS != ret) { /* allocate space for the header plus space to store ddt_len */ frag_len = sizeof(*header) + 8; ret = ompi_osc_pt2pt_frag_alloc(module, target, frag_len, &frag, &ptr, true, !request); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return OMPI_ERR_OUT_OF_RESOURCE; } is_long_datatype = true; } is_long_msg = true; tag = get_tag (module); } else { /* still need to set the tag for the active/passive logic on the target */ tag = !!(module->passive_target_access_epoch); } if (is_long_msg) { /* wait for synchronization before posting a long message */ if (pt2pt_sync->type == OMPI_OSC_PT2PT_SYNC_TYPE_LOCK) { OPAL_THREAD_LOCK(&pt2pt_sync->lock); ompi_osc_pt2pt_peer_t *peer = ompi_osc_pt2pt_peer_lookup (module, target); while (!(peer->flags & OMPI_OSC_PT2PT_PEER_FLAG_EAGER)) { opal_condition_wait(&pt2pt_sync->cond, &pt2pt_sync->lock); } OPAL_THREAD_UNLOCK(&pt2pt_sync->lock); } else { ompi_osc_pt2pt_sync_wait_expected (pt2pt_sync); } } header = (ompi_osc_pt2pt_header_acc_t*) ptr; header->base.flags = 0; header->len = frag_len; header->count = target_count; header->displacement = target_disp; header->op = op->o_f_to_c_index; header->tag = tag; ptr += sizeof (*header); do { ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } if (is_long_datatype) { /* the datatype does not fit in an eager message. 
send it separately */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_LARGE_DATATYPE; OMPI_DATATYPE_RETAIN(target_dt); ret = ompi_osc_pt2pt_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target, tag_to_target(tag), module->comm, ompi_osc_pt2pt_dt_send_complete, target_dt); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } *((uint64_t *) ptr) = ddt_len; ptr += 8; } else { memcpy((unsigned char*) ptr, packed_ddt, ddt_len); ptr += ddt_len; } if (!is_long_msg) { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC; osc_pt2pt_hton(header, proc); osc_pt2pt_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count, origin_dt); /* the user's buffer is no longer needed so mark the request as * complete. */ if (request) { ompi_osc_pt2pt_request_complete (request, MPI_SUCCESS); } } else { header->base.type = OMPI_OSC_PT2PT_HDR_TYPE_ACC_LONG; osc_pt2pt_hton(header, proc); OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: starting long accumulate with tag %d", tag)); ret = ompi_osc_pt2pt_data_isend (module, origin_addr, origin_count, origin_dt, target, tag_to_target(tag), request); } } while (0); if (OMPI_SUCCESS != ret) { OPAL_OUTPUT_VERBOSE((25, ompi_osc_base_framework.framework_output, "acc: failed with error %d", ret)); } else { /* mark the fragment as valid */ header->base.flags |= OMPI_OSC_PT2PT_HDR_FLAG_VALID; } return ompi_osc_pt2pt_frag_finish(module, frag);  }
int ompi_request_default_wait_all( size_t count, ompi_request_t ** requests, ompi_status_public_t * statuses ) { size_t completed = 0, i, failed = 0; ompi_request_t **rptr; ompi_request_t *request; int mpi_error = OMPI_SUCCESS; rptr = requests; for (i = 0; i < count; i++) { request = *rptr++; if (request->req_complete == true) { if( OPAL_UNLIKELY( MPI_SUCCESS != request->req_status.MPI_ERROR ) ) { failed++; } completed++; } } if( failed > 0 ) { goto finish; } /* if all requests have not completed -- defer acquiring lock * unless required */ if (completed != count) { /* * acquire lock and test for completion - if all requests are * not completed pend on condition variable until a request * completes */ OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_waiting++; #if OPAL_ENABLE_MULTI_THREADS /* * confirm the status of the pending requests. We have to do it before * taking the condition or otherwise we can miss some requests completion (the * one that happpens between our initial test and the aquisition of the lock). */ rptr = requests; for( completed = i = 0; i < count; i++ ) { request = *rptr++; if (request->req_complete == true) { if( MPI_SUCCESS != request->req_status.MPI_ERROR ) { failed++; } completed++; } } if( failed > 0 ) { ompi_request_waiting--; OPAL_THREAD_UNLOCK(&ompi_request_lock); goto finish; } #endif /* OPAL_ENABLE_MULTI_THREADS */ while( completed != count ) { /* check number of pending requests */ size_t start = ompi_request_completed; size_t pending = count - completed; size_t start_failed = ompi_request_failed; /* * wait until at least pending requests complete */ while (pending > ompi_request_completed - start) { opal_condition_wait(&ompi_request_cond, &ompi_request_lock); /* * Check for failed requests. If one request fails, then * this operation completes in error marking the remaining * requests as PENDING. */ if( OPAL_UNLIKELY( 0 < (ompi_request_failed - start_failed) ) ) { failed += (ompi_request_failed - start_failed); ompi_request_waiting--; OPAL_THREAD_UNLOCK(&ompi_request_lock); goto finish; } } /* * confirm that all pending operations have completed. */ rptr = requests; for( failed = completed = i = 0; i < count; i++ ) { request = *rptr++; if (request->req_complete == true) { if( MPI_SUCCESS != request->req_status.MPI_ERROR ) { failed++; } completed++; } } } ompi_request_waiting--; OPAL_THREAD_UNLOCK(&ompi_request_lock); } #if OPAL_ENABLE_FT_CR == 1 if( opal_cr_is_enabled) { rptr = requests; for (i = 0; i < count; i++, rptr++) { request = *rptr; if( true == request->req_complete) { OMPI_CRCP_REQUEST_COMPLETE(request); } } } #endif finish: rptr = requests; if (MPI_STATUSES_IGNORE != statuses) { /* fill out status and free request if required */ for( i = 0; i < count; i++, rptr++ ) { request = *rptr; /* * Assert only if no requests were failed. * Since some may still be pending. */ if( 0 >= failed ) { assert( true == request->req_complete ); } if( request->req_state == OMPI_REQUEST_INACTIVE ) { statuses[i] = ompi_status_empty; continue; } if (OMPI_REQUEST_GEN == request->req_type) { ompi_grequest_invoke_query(request, &request->req_status); } statuses[i] = request->req_status; /* * Per MPI 2.2 p 60: * Allows requests to be marked as MPI_ERR_PENDING if they are * "neither failed nor completed." Which can only happen if * there was an error in one of the other requests. 
*/ if( OPAL_UNLIKELY(0 < failed) ) { if( !request->req_complete ) { statuses[i].MPI_ERROR = MPI_ERR_PENDING; mpi_error = MPI_ERR_IN_STATUS; continue; } } if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; continue; } else { /* Only free the request if there is no error on it */ if (MPI_SUCCESS == request->req_status.MPI_ERROR) { /* If there's an error while freeing the request, assume that the request is still there. Otherwise, Bad Things will happen later! */ int tmp = ompi_request_free(rptr); if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) { mpi_error = tmp; } } } if( statuses[i].MPI_ERROR != OMPI_SUCCESS) { mpi_error = MPI_ERR_IN_STATUS; } } } else { /* free request if required */ for( i = 0; i < count; i++, rptr++ ) { int rc; request = *rptr; /* * Assert only if no requests were failed. * Since some may still be pending. */ if( 0 >= failed ) { assert( true == request->req_complete ); } else { /* If the request is still pending due to a failed request * then skip it in this loop. */ if( !request->req_complete ) { continue; } } /* Per note above, we have to call gen request query_fn even if STATUSES_IGNORE was provided */ if (OMPI_REQUEST_GEN == request->req_type) { rc = ompi_grequest_invoke_query(request, &request->req_status); } if( request->req_state == OMPI_REQUEST_INACTIVE ) { rc = ompi_status_empty.MPI_ERROR; } else { rc = request->req_status.MPI_ERROR; } if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; } else if (MPI_SUCCESS == rc) { /* Only free the request if there is no error on it */ int tmp = ompi_request_free(rptr); if (OMPI_SUCCESS == mpi_error && OMPI_SUCCESS != tmp) { mpi_error = tmp; } } /* * Per MPI 2.2 p34: * "It is possible for an MPI function to return MPI_ERR_IN_STATUS * even when MPI_STATUS_IGNORE or MPI_STATUSES_IGNORE has been * passed to that function." * So we should do so here as well. */ if( OMPI_SUCCESS == mpi_error && rc != OMPI_SUCCESS) { mpi_error = MPI_ERR_IN_STATUS; } } } return mpi_error; }
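/* Passive-target unlock: wait for the lock ack from the target, account for the queued send requests plus the final unlock ack, send the unlock control message, start the queued requests, and wait for all outgoing completions before clearing the access modes. */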
int ompi_osc_pt2pt_module_unlock(int target, ompi_win_t *win) { int32_t out_count; opal_list_item_t *item; int ret; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); ompi_proc_t *proc = ompi_comm_peer_lookup( module->p2p_comm, target ); OPAL_THREAD_LOCK(&module->p2p_lock); while (0 == module->p2p_lock_received_ack) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } module->p2p_lock_received_ack -= 1; /* start all the requests */ ompi_osc_pt2pt_flip_sendreqs(module); /* try to start all the requests. We've copied everything we need out of pending_sendreqs, so don't need the lock here */ out_count = opal_list_get_size(&(module->p2p_copy_pending_sendreqs)); /* we want to send all the requests, plus we wait for one more completion event for the control message ack from the unlocker saying we're done */ module->p2p_num_pending_out += (out_count + 1); OPAL_THREAD_UNLOCK(&module->p2p_lock); /* send the unlock request */ OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: sending unlock request to %d with %d requests", ompi_comm_rank(module->p2p_comm), target, out_count)); ompi_osc_pt2pt_control_send(module, proc, OMPI_OSC_PT2PT_HDR_UNLOCK_REQ, ompi_comm_rank(module->p2p_comm), out_count); while (NULL != (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { ompi_osc_pt2pt_sendreq_t *req = (ompi_osc_pt2pt_sendreq_t*) item; ret = ompi_osc_pt2pt_sendreq_send(module, req); if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret ) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); opal_list_append(&(module->p2p_copy_pending_sendreqs), item); } else if (OMPI_SUCCESS != ret) { return ret; } } /* wait for all the requests */ OPAL_THREAD_LOCK(&module->p2p_lock); while (0 != module->p2p_num_pending_out) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } OPAL_THREAD_UNLOCK(&module->p2p_lock); OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_output, "%d: finished unlock to %d", ompi_comm_rank(module->p2p_comm), target)); /* set our mode on the window */ ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_LOCK_ACCESS); return OMPI_SUCCESS; }
static int component_select(struct ompi_win_t *win, void **base, size_t size, int disp_unit, struct ompi_communicator_t *comm, struct ompi_info_t *info, int flavor, int *model) { ompi_osc_portals4_module_t *module = NULL; int ret = OMPI_ERROR; int tmp; ptl_md_t md; ptl_me_t me; char *name; if (MPI_WIN_FLAVOR_SHARED == flavor) return OMPI_ERR_NOT_SUPPORTED; /* create module structure */ module = (ompi_osc_portals4_module_t*) calloc(1, sizeof(ompi_osc_portals4_module_t)); if (NULL == module) return OMPI_ERR_TEMP_OUT_OF_RESOURCE; /* fill in the function pointer part */ memcpy(module, &ompi_osc_portals4_module_template, sizeof(ompi_osc_base_module_t)); /* fill in our part */ if (MPI_WIN_FLAVOR_ALLOCATE == flavor) { module->free_after = *base = malloc(size); if (NULL == *base) goto error; } else { module->free_after = NULL; } ret = ompi_comm_dup(comm, &module->comm); if (OMPI_SUCCESS != ret) goto error; opal_output_verbose(1, ompi_osc_base_framework.framework_output, "portals4 component creating window with id %d", ompi_comm_get_cid(module->comm)); asprintf(&name, "portals4 window %d", ompi_comm_get_cid(module->comm)); ompi_win_set_name(win, name); free(name); /* share everyone's displacement units. Only do an allgather if strictly necessary, since it requires O(p) state. */ tmp = disp_unit; ret = module->comm->c_coll.coll_bcast(&tmp, 1, MPI_INT, 0, module->comm, module->comm->c_coll.coll_bcast_module); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: MPI_Bcast failed: %d\n", __FILE__, __LINE__, ret); goto error; } tmp = (tmp == disp_unit) ? 1 : 0; ret = module->comm->c_coll.coll_allreduce(MPI_IN_PLACE, &tmp, 1, MPI_INT, MPI_LAND, module->comm, module->comm->c_coll.coll_allreduce_module); if (OMPI_SUCCESS != ret) goto error; if (tmp == 1) { module->disp_unit = disp_unit; module->disp_units = NULL; } else { module->disp_unit = -1; module->disp_units = malloc(sizeof(int) * ompi_comm_size(module->comm)); ret = module->comm->c_coll.coll_allgather(&disp_unit, 1, MPI_INT, module->disp_units, 1, MPI_INT, module->comm, module->comm->c_coll.coll_allgather_module); if (OMPI_SUCCESS != ret) goto error; } module->ni_h = mca_osc_portals4_component.matching_ni_h; module->pt_idx = mca_osc_portals4_component.matching_pt_idx; ret = PtlCTAlloc(module->ni_h, &(module->ct_h)); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlCTAlloc failed: %d\n", __FILE__, __LINE__, ret); goto error; } md.start = 0; md.length = PTL_SIZE_MAX; md.options = PTL_MD_EVENT_SUCCESS_DISABLE | PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; md.eq_handle = mca_osc_portals4_component.matching_eq_h; md.ct_handle = module->ct_h; ret = PtlMDBind(module->ni_h, &md, &module->md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); goto error; } md.start = 0; md.length = PTL_SIZE_MAX; md.options = PTL_MD_EVENT_SEND_DISABLE | PTL_MD_EVENT_CT_REPLY | PTL_MD_EVENT_CT_ACK; md.eq_handle = mca_osc_portals4_component.matching_eq_h; md.ct_handle = module->ct_h; ret = PtlMDBind(module->ni_h, &md, &module->req_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); goto error; } if (MPI_WIN_FLAVOR_DYNAMIC == flavor) { me.start = 0; me.length = PTL_SIZE_MAX; } else { me.start = *base; me.length = size; } me.ct_handle = PTL_CT_NONE; me.uid = mca_osc_portals4_component.uid; 
me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = module->comm->c_contextid; me.ignore_bits = 0; ret = PtlMEAppend(module->ni_h, module->pt_idx, &me, PTL_PRIORITY_LIST, &module->ct_link, &module->data_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlMEAppend failed: %d\n", __FILE__, __LINE__, ret); goto error; } me.start = &module->state; me.length = sizeof(module->state); me.ct_handle = PTL_CT_NONE; me.uid = mca_osc_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_NO_TRUNCATE | PTL_ME_EVENT_SUCCESS_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = module->comm->c_contextid | OSC_PORTALS4_MB_CONTROL; me.ignore_bits = 0; ret = PtlMEAppend(module->ni_h, module->pt_idx, &me, PTL_PRIORITY_LIST, &module->ct_link, &module->control_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_osc_base_framework.framework_output, "%s:%d: PtlMEAppend failed: %d\n", __FILE__, __LINE__, ret); goto error; } module->opcount = 0; module->match_bits = module->comm->c_contextid; module->atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ? mca_osc_portals4_component.matching_atomic_max : MIN(mca_osc_portals4_component.matching_atomic_max, mca_osc_portals4_component.matching_atomic_ordered_size); module->fetch_atomic_max = (check_config_value_equal("accumulate_ordering", info, "none")) ? mca_osc_portals4_component.matching_fetch_atomic_max : MIN(mca_osc_portals4_component.matching_fetch_atomic_max, mca_osc_portals4_component.matching_atomic_ordered_size); module->zero = 0; module->one = 1; module->start_group = NULL; module->post_group = NULL; module->state.post_count = 0; module->state.complete_count = 0; if (check_config_value_bool("no_locks", info)) { module->state.lock = LOCK_ILLEGAL; } else { module->state.lock = LOCK_UNLOCKED; } OBJ_CONSTRUCT(&module->outstanding_locks, opal_list_t); module->passive_target_access_epoch = false; #if OPAL_ASSEMBLY_ARCH == OPAL_AMD64 || OPAL_ASSEMBLY_ARCH == OPAL_IA32 *model = MPI_WIN_UNIFIED; #else *model = MPI_WIN_SEPARATE; #endif win->w_osc_module = &module->super; PtlAtomicSync(); /* Make sure that everyone's ready to receive. */ OPAL_THREAD_LOCK(&mca_osc_portals4_component.lock); while (module->ct_link != 2) { opal_condition_wait(&mca_osc_portals4_component.cond, &mca_osc_portals4_component.lock); } OPAL_THREAD_UNLOCK(&mca_osc_portals4_component.lock); module->comm->c_coll.coll_barrier(module->comm, module->comm->c_coll.coll_barrier_module); return OMPI_SUCCESS; error: /* BWB: FIX ME: This is all wrong... */ if (0 != module->ct_h) PtlCTFree(module->ct_h); if (0 != module->data_me_h) PtlMEUnlink(module->data_me_h); if (0 != module->req_md_h) PtlMDRelease(module->req_md_h); if (0 != module->md_h) PtlMDRelease(module->md_h); if (NULL != module->comm) ompi_comm_free(&module->comm); if (NULL != module) free(module); return ret; }
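/* MPI_Win_complete for the pt2pt component: wait for all post messages, tell each process in the start group how many updates to expect, start the queued send requests, and wait for them to complete before releasing the group. */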
int ompi_osc_pt2pt_module_complete(ompi_win_t *win) { int i; int ret = OMPI_SUCCESS; ompi_group_t *group; opal_list_item_t *item; ompi_osc_pt2pt_module_t *module = P2P_MODULE(win); /* wait for all the post messages */ OPAL_THREAD_LOCK(&module->p2p_lock); while (0 != module->p2p_num_post_msgs) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } ompi_osc_pt2pt_flip_sendreqs(module); /* for each process in group, send a control message with number of updates coming, then start all the requests */ for (i = 0 ; i < ompi_group_size(module->p2p_sc_group) ; ++i) { int comm_rank = module->p2p_sc_remote_ranks[i]; module->p2p_num_pending_out += module->p2p_copy_num_pending_sendreqs[comm_rank]; } OPAL_THREAD_UNLOCK(&module->p2p_lock); for (i = 0 ; i < ompi_group_size(module->p2p_sc_group) ; ++i) { int comm_rank = module->p2p_sc_remote_ranks[i]; ret = ompi_osc_pt2pt_control_send(module, ompi_group_peer_lookup(module->p2p_sc_group, i), OMPI_OSC_PT2PT_HDR_COMPLETE, module->p2p_copy_num_pending_sendreqs[comm_rank], 0); assert(ret == OMPI_SUCCESS); } /* try to start all the requests. We've copied everything we need out of pending_sendreqs, so don't need the lock here */ while (NULL != (item = opal_list_remove_first(&(module->p2p_copy_pending_sendreqs)))) { ompi_osc_pt2pt_sendreq_t *req = (ompi_osc_pt2pt_sendreq_t*) item; ret = ompi_osc_pt2pt_sendreq_send(module, req); if (OMPI_ERR_TEMP_OUT_OF_RESOURCE == ret ) { opal_output_verbose(5, ompi_osc_base_output, "complete: failure in starting sendreq (%d). Will try later.", ret); opal_list_append(&(module->p2p_copy_pending_sendreqs), item); } else if (OMPI_SUCCESS != ret) { return ret; } } /* wait for all the requests */ OPAL_THREAD_LOCK(&module->p2p_lock); while (0 != module->p2p_num_pending_out) { opal_condition_wait(&module->p2p_cond, &module->p2p_lock); } group = module->p2p_sc_group; module->p2p_sc_group = NULL; OPAL_THREAD_UNLOCK(&module->p2p_lock); /* remove WIN_POSTED from our mode */ ompi_win_remove_mode(win, OMPI_WIN_ACCESS_EPOCH | OMPI_WIN_STARTED); ompi_group_decrement_proc_count(group); OBJ_RELEASE(group); return ret; }
int main(int argc, char *argv[]) { int ret = 0; int fd; opal_cmd_line_t *cmd_line = NULL; char *log_path = NULL; char log_file[PATH_MAX]; char *jobidstring; orte_gpr_value_t *value; char *segment; int i; orte_buffer_t answer; char *umask_str; /* Allow the PLS starters to pass us a umask to use, if required. Most starters by default can do something sane with the umask, but some (like TM) do not pass on the umask but instead inherit it form the root level process starter. This has to happen before opal_init and everything else so that the couple of places that stash a umask end up with the correct value. Only do it here (and not in orte_daemon) mainly to make it clear that this should only happen when starting an orted for the first time. All startes I'm aware of that don't require an orted are smart enough to pass on a reasonable umask, so they wouldn't need this functionality anyway. */ umask_str = getenv("ORTE_DAEMON_UMASK_VALUE"); if (NULL != umask_str) { char *endptr; long mask = strtol(umask_str, &endptr, 8); if ((! (0 == mask && (EINVAL == errno || ERANGE == errno))) && (*endptr == '\0')) { umask(mask); } } /* initialize the globals */ memset(&orted_globals, 0, sizeof(orted_globals_t)); /* Ensure that enough of OPAL is setup for us to be able to run */ if (OPAL_SUCCESS != opal_init_util()) { fprintf(stderr, "OPAL failed to initialize -- orted aborting\n"); exit(1); } /* save the environment for use when launching application processes */ orted_globals.saved_environ = opal_argv_copy(environ); /* setup mca param system */ mca_base_param_init(); /* setup to check common command line options that just report and die */ cmd_line = OBJ_NEW(opal_cmd_line_t); opal_cmd_line_create(cmd_line, orte_cmd_line_opts); if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, argc, argv))) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); opal_show_help("help-orted.txt", "orted:usage", false, argv[0], args); free(args); return ret; } /* check for help request */ if (orted_globals.help) { char *args = NULL; args = opal_cmd_line_get_usage_msg(cmd_line); opal_show_help("help-orted.txt", "orted:usage", false, argv[0], args); free(args); return 1; } /* see if we were directed to separate from current session */ if (orted_globals.set_sid) { setsid(); } /* see if they want us to spin until they can connect a debugger to us */ i=0; while (orted_globals.spin) { i++; if (1000 < i) i=0; } /* Okay, now on to serious business! */ /* Ensure the process info structure in instantiated and initialized * and set the daemon flag to true */ orte_process_info.daemon = true; /* * If the daemon was given a name on the command line, need to set the * proper indicators in the environment so the name discovery service * can find it */ if (orted_globals.name) { if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds", "env", true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds", "env", ret); return ret; } if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_name", orted_globals.name, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_name", orted_globals.name, ret); return ret; } /* the following values are meaningless to the daemon, but may have * been passed in anyway. 
we set them here because the nds_env component * requires that they be set */ if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_vpid_start", orted_globals.vpid_start, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_vpid_start", orted_globals.vpid_start, ret); return ret; } if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds_num_procs", orted_globals.num_procs, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds_num_procs", orted_globals.num_procs, ret); return ret; } } if (orted_globals.ns_nds) { if (ORTE_SUCCESS != (ret = opal_setenv("OMPI_MCA_ns_nds", orted_globals.ns_nds, true, &environ))) { opal_show_help("help-orted.txt", "orted:environ", false, "OMPI_MCA_ns_nds", "env", ret); return ret; } } /* turn on debug if debug_file is requested so output will be generated */ if (orted_globals.debug_daemons_file) { orted_globals.debug_daemons = true; } /* detach from controlling terminal * otherwise, remain attached so output can get to us */ if(orted_globals.debug == false && orted_globals.debug_daemons == false && orted_globals.no_daemonize == false) { opal_daemon_init(NULL); } /* Intialize the Open RTE */ /* Set the flag telling orte_init that I am NOT a * singleton, but am "infrastructure" - prevents setting * up incorrect infrastructure that only a singleton would * require */ if (ORTE_SUCCESS != (ret = orte_init(true))) { opal_show_help("help-orted.txt", "orted:init-failure", false, "orte_init()", ret); return ret; } /* Set signal handlers to catch kill signals so we can properly clean up * after ourselves. */ opal_event_set(&term_handler, SIGTERM, OPAL_EV_SIGNAL, signal_callback, NULL); opal_event_add(&term_handler, NULL); opal_event_set(&int_handler, SIGINT, OPAL_EV_SIGNAL, signal_callback, NULL); opal_event_add(&int_handler, NULL); /* if requested, report my uri to the indicated pipe */ if (orted_globals.uri_pipe > 0) { write(orted_globals.uri_pipe, orte_universe_info.seed_uri, strlen(orte_universe_info.seed_uri)+1); /* need to add 1 to get the NULL */ close(orted_globals.uri_pipe); } /* setup stdout/stderr */ if (orted_globals.debug_daemons_file) { /* if we are debugging to a file, then send stdout/stderr to * the orted log file */ /* get my jobid */ if (ORTE_SUCCESS != (ret = orte_ns.get_jobid_string(&jobidstring, orte_process_info.my_name))) { ORTE_ERROR_LOG(ret); return ret; } /* define a log file name in the session directory */ sprintf(log_file, "output-orted-%s-%s.log", jobidstring, orte_system_info.nodename); log_path = opal_os_path(false, orte_process_info.tmpdir_base, orte_process_info.top_session_dir, log_file, NULL); fd = open(log_path, O_RDWR|O_CREAT|O_TRUNC, 0640); if (fd < 0) { /* couldn't open the file for some reason, so * just connect everything to /dev/null */ fd = open("/dev/null", O_RDWR|O_CREAT|O_TRUNC, 0666); } else { dup2(fd, STDOUT_FILENO); dup2(fd, STDERR_FILENO); if(fd != STDOUT_FILENO && fd != STDERR_FILENO) { close(fd); } } } /* output a message indicating we are alive, our name, and our pid * for debugging purposes */ if (orted_globals.debug_daemons) { fprintf(stderr, "Daemon [%ld,%ld,%ld] checking in as pid %ld on host %s\n", ORTE_NAME_ARGS(orte_process_info.my_name), (long)orte_process_info.pid, orte_system_info.nodename); } /* setup the thread lock and condition variables */ OBJ_CONSTRUCT(&orted_globals.mutex, opal_mutex_t); OBJ_CONSTRUCT(&orted_globals.condition, opal_condition_t); /* register the daemon main receive functions */ ret = 
orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED, ORTE_RML_NON_PERSISTENT, orte_daemon_recv_pls, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; } ret = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON, ORTE_RML_NON_PERSISTENT, orte_daemon_recv, NULL); if (ret != ORTE_SUCCESS && ret != ORTE_ERR_NOT_IMPLEMENTED) { ORTE_ERROR_LOG(ret); return ret; } /* check to see if I'm a bootproxy */ if (orted_globals.bootproxy) { /* perform bootproxy-specific things */ if (orted_globals.mpi_call_yield > 0) { char *var; var = mca_base_param_environ_variable("mpi", NULL, "yield_when_idle"); opal_setenv(var, "1", true, &environ); } /* attach a subscription to the orted standard trigger so I can get * information on the processes I am to locally launch as soon as all * the orteds for this job are started. * * Once the registry gets to 2.0, we will be able to setup the * subscription so we only get our own launch info back. In the interim, * we setup the subscription so that ALL launch info for this job * is returned. We will then have to parse that message to get our * own local launch info. * * Since we have chosen this approach, we can take advantage of the * fact that the callback function will directly receive this data. * By setting up that callback function to actually perform the launch * based on the received data, all we have to do here is go into our * conditioned wait until the job completes! * * Sometimes, life can be good! :-) */ /** put all this registry stuff in a compound command to limit communications */ if (ORTE_SUCCESS != (ret = orte_gpr.begin_compound_cmd())) { ORTE_ERROR_LOG(ret); return ret; } /* let the local launcher setup a subscription for its required data. 
We * pass the local_cb_launcher function so that this gets called back - this * allows us to wakeup the orted so it can exit cleanly if the callback * generates an error */ if (ORTE_SUCCESS != (ret = orte_odls.subscribe_launch_data(orted_globals.bootproxy, orted_local_cb_launcher))) { ORTE_ERROR_LOG(ret); return ret; } /* get the job segment name */ if (ORTE_SUCCESS != (ret = orte_schema.get_job_segment_name(&segment, orted_globals.bootproxy))) { ORTE_ERROR_LOG(ret); return ret; } /** increment the orted stage gate counter */ if (ORTE_SUCCESS != (ret = orte_gpr.create_value(&value, ORTE_GPR_KEYS_OR|ORTE_GPR_TOKENS_AND, segment, 1, 1))) { ORTE_ERROR_LOG(ret); return ret; } free(segment); /* done with this now */ value->tokens[0] = strdup(ORTE_JOB_GLOBALS); if (ORTE_SUCCESS != (ret = orte_gpr.create_keyval(&(value->keyvals[0]), ORTED_LAUNCH_STAGE_GATE_CNTR, ORTE_UNDEF, NULL))) { ORTE_ERROR_LOG(ret); return ret; } /* do the increment */ if (ORTE_SUCCESS != (ret = orte_gpr.increment_value(value))) { ORTE_ERROR_LOG(ret); return ret; } OBJ_RELEASE(value); /* done with this now */ /** send the compound command */ if (ORTE_SUCCESS != (ret = orte_gpr.exec_compound_cmd())) { ORTE_ERROR_LOG(ret); return ret; } /* setup and enter the event monitor to wait for a wakeup call */ OPAL_THREAD_LOCK(&orted_globals.mutex); while (false == orted_globals.exit_condition) { opal_condition_wait(&orted_globals.condition, &orted_globals.mutex); } OPAL_THREAD_UNLOCK(&orted_globals.mutex); /* make sure our local procs are dead - but don't update their state * on the HNP as this may be redundant */ orte_odls.kill_local_procs(ORTE_JOBID_WILDCARD, false); /* cleanup their session directory */ orte_session_dir_cleanup(orted_globals.bootproxy); /* send an ack - we are as close to done as we can be while * still able to communicate */ OBJ_CONSTRUCT(&answer, orte_buffer_t); if (0 > orte_rml.send_buffer(ORTE_PROC_MY_HNP, &answer, ORTE_RML_TAG_PLS_ORTED_ACK, 0)) { ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); } OBJ_DESTRUCT(&answer); /* Finalize and clean up ourselves */ if (ORTE_SUCCESS != (ret = orte_finalize())) { ORTE_ERROR_LOG(ret); } exit(ret); } /* * Set my process status to "running". Note that this must be done * after the rte init is completed. */ if (ORTE_SUCCESS != (ret = orte_smr.set_proc_state(orte_process_info.my_name, ORTE_PROC_STATE_RUNNING, 0))) { ORTE_ERROR_LOG(ret); return ret; } if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] ompid: issuing callback", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* go through the universe fields and see what else I need to do * - could be setup a virtual machine, spawn a console, etc. */ if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] ompid: setting up event monitor", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* setup and enter the event monitor */ OPAL_THREAD_LOCK(&orted_globals.mutex); while (false == orted_globals.exit_condition) { opal_condition_wait(&orted_globals.condition, &orted_globals.mutex); } OPAL_THREAD_UNLOCK(&orted_globals.mutex); if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] orted: mutex cleared - finalizing", ORTE_NAME_ARGS(orte_process_info.my_name)); } /* cleanup */ if (NULL != log_path) { unlink(log_path); } /* finalize the system */ orte_finalize(); if (orted_globals.debug_daemons) { opal_output(0, "[%lu,%lu,%lu] orted: done - exiting", ORTE_NAME_ARGS(orte_process_info.my_name)); } exit(0); }
int orte_pls_rsh_launch(orte_jobid_t jobid) { orte_job_map_t *map; opal_list_item_t *n_item; orte_mapped_node_t *rmaps_node; orte_std_cntr_t num_nodes; orte_vpid_t vpid; int node_name_index1; int node_name_index2; int proc_name_index; int local_exec_index, local_exec_index_end; char *jobid_string = NULL; char *uri, *param; char **argv = NULL, **tmp; char *prefix_dir; int argc; int rc; sigset_t sigs; struct passwd *p; bool remote_sh = false, remote_csh = false; bool local_sh = false, local_csh = false; char *lib_base = NULL, *bin_base = NULL; orte_pls_daemon_info_t *dmn; orte_pls_rsh_shell_t shell; if (mca_pls_rsh_component.timing) { if (0 != gettimeofday(&joblaunchstart, NULL)) { opal_output(0, "pls_rsh: could not obtain start time"); joblaunchstart.tv_sec = 0; joblaunchstart.tv_usec = 0; } } /* setup a list that will contain the info for all the daemons * so we can store it on the registry when done and use it * locally to track their state */ OBJ_CONSTRUCT(&active_daemons, opal_list_t); /* Get the map for this job * We need the entire mapping for a couple of reasons: * - need the prefix to start with. * - need to know the nodes we are launching on * All other mapping responsibilities fall to orted in the fork PLS */ rc = orte_rmaps.get_job_map(&map, jobid); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&active_daemons); return rc; } /* if the user requested that we re-use daemons, * launch the procs on any existing, re-usable daemons */ if (orte_pls_base.reuse_daemons) { if (ORTE_SUCCESS != (rc = orte_pls_base_launch_on_existing_daemons(map))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(map); OBJ_DESTRUCT(&active_daemons); return rc; } } num_nodes = (orte_std_cntr_t)opal_list_get_size(&map->nodes); if (0 == num_nodes) { /* nothing left to do - just return */ OBJ_RELEASE(map); OBJ_DESTRUCT(&active_daemons); return ORTE_SUCCESS; } if (mca_pls_rsh_component.debug_daemons && mca_pls_rsh_component.num_concurrent < num_nodes) { /* we can't run in this situation, so pretty print the error * and exit */ opal_show_help("help-pls-rsh.txt", "deadlock-params", true, mca_pls_rsh_component.num_concurrent, num_nodes); OBJ_RELEASE(map); OBJ_DESTRUCT(&active_daemons); return ORTE_ERR_FATAL; } /* * After a discussion between Ralph & Jeff, we concluded that we * really are handling the prefix dir option incorrectly. It currently * is associated with an app_context, yet it really refers to the * location where OpenRTE/Open MPI is installed on a NODE. Fixing * this right now would involve significant change to orterun as well * as elsewhere, so we will intentionally leave this incorrect at this * point. The error, however, is identical to that seen in all prior * releases of OpenRTE/Open MPI, so our behavior is no worse than before. * * A note to fix this, along with ideas on how to do so, has been filed * on the project's Trac system under "feature enhancement". * * For now, default to the prefix_dir provided in the first app_context. * Since there always MUST be at least one app_context, we are safe in * doing this. */ prefix_dir = map->apps[0]->prefix_dir; /* * Allocate a range of vpids for the daemons. 
*/ if (num_nodes == 0) { return ORTE_ERR_BAD_PARAM; } rc = orte_ns.reserve_range(0, num_nodes, &vpid); if (ORTE_SUCCESS != rc) { goto cleanup; } /* setup the orted triggers for passing their launch info */ if (ORTE_SUCCESS != (rc = orte_smr.init_orted_stage_gates(jobid, num_nodes, NULL, NULL))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* need integer value for command line parameter */ if (ORTE_SUCCESS != (rc = orte_ns.convert_jobid_to_string(&jobid_string, jobid))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* What is our local shell? */ shell = ORTE_PLS_RSH_SHELL_UNKNOWN; p = getpwuid(getuid()); if (NULL != p) { param = p->pw_shell; shell = find_shell(p->pw_shell); } /* If we didn't find it in getpwuid(), try looking at the $SHELL environment variable (see https://svn.open-mpi.org/trac/ompi/ticket/1060) */ if (ORTE_PLS_RSH_SHELL_UNKNOWN == shell && NULL != (param = getenv("SHELL"))) { shell = find_shell(param); } switch (shell) { case ORTE_PLS_RSH_SHELL_SH: /* fall through */ case ORTE_PLS_RSH_SHELL_KSH: /* fall through */ case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */ case ORTE_PLS_RSH_SHELL_BASH: local_sh = true; break; case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */ case ORTE_PLS_RSH_SHELL_CSH: local_csh = true; break; default: opal_output(0, "WARNING: local probe returned unhandled shell:%s assuming bash\n", (NULL != param) ? param : "unknown"); remote_sh = true; break; } if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: local csh: %d, local sh: %d\n", local_csh, local_sh); } /* What is our remote shell? */ if (mca_pls_rsh_component.assume_same_shell) { remote_sh = local_sh; remote_csh = local_csh; if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: assuming same remote shell as local shell"); } } else { orte_pls_rsh_shell_t shell; rmaps_node = (orte_mapped_node_t*)opal_list_get_first(&map->nodes); rc = orte_pls_rsh_probe(rmaps_node, &shell); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); return rc; } switch (shell) { case ORTE_PLS_RSH_SHELL_SH: /* fall through */ case ORTE_PLS_RSH_SHELL_KSH: /* fall through */ case ORTE_PLS_RSH_SHELL_ZSH: /* fall through */ case ORTE_PLS_RSH_SHELL_BASH: remote_sh = true; break; case ORTE_PLS_RSH_SHELL_TCSH: /* fall through */ case ORTE_PLS_RSH_SHELL_CSH: remote_csh = true; break; default: opal_output(0, "WARNING: rsh probe returned unhandled shell; assuming bash\n"); remote_sh = true; } } if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: remote csh: %d, remote sh: %d\n", remote_csh, remote_sh); } /* * Build argv array */ argv = opal_argv_copy(mca_pls_rsh_component.agent_argv); argc = mca_pls_rsh_component.agent_argc; node_name_index1 = argc; opal_argv_append(&argc, &argv, "<template>"); /* Do we need to source .profile on the remote side? */ if (!(remote_csh || remote_sh)) { int i; tmp = opal_argv_split("( test ! -r ./.profile || . 
./.profile;", ' '); if (NULL == tmp) { return ORTE_ERR_OUT_OF_RESOURCE; } for (i = 0; NULL != tmp[i]; ++i) { opal_argv_append(&argc, &argv, tmp[i]); } opal_argv_free(tmp); } /* add the daemon command (as specified by user) */ local_exec_index = argc; opal_argv_append(&argc, &argv, mca_pls_rsh_component.orted); /* check for debug flags */ orte_pls_base_mca_argv(&argc, &argv); opal_argv_append(&argc, &argv, "--bootproxy"); opal_argv_append(&argc, &argv, jobid_string); opal_argv_append(&argc, &argv, "--name"); proc_name_index = argc; opal_argv_append(&argc, &argv, "<template>"); /* tell the daemon how many procs are in the daemon's job */ opal_argv_append(&argc, &argv, "--num_procs"); asprintf(&param, "%lu", (unsigned long)(vpid + num_nodes)); opal_argv_append(&argc, &argv, param); free(param); /* tell the daemon the starting vpid of the daemon's job */ opal_argv_append(&argc, &argv, "--vpid_start"); opal_argv_append(&argc, &argv, "0"); opal_argv_append(&argc, &argv, "--nodename"); node_name_index2 = argc; opal_argv_append(&argc, &argv, "<template>"); /* pass along the universe name and location info */ opal_argv_append(&argc, &argv, "--universe"); asprintf(&param, "%s@%s:%s", orte_universe_info.uid, orte_universe_info.host, orte_universe_info.name); opal_argv_append(&argc, &argv, param); free(param); /* setup ns contact info */ opal_argv_append(&argc, &argv, "--nsreplica"); if (NULL != orte_process_info.ns_replica_uri) { uri = strdup(orte_process_info.ns_replica_uri); } else { uri = orte_rml.get_uri(); } asprintf(&param, "\"%s\"", uri); opal_argv_append(&argc, &argv, param); free(uri); free(param); /* setup gpr contact info */ opal_argv_append(&argc, &argv, "--gprreplica"); if (NULL != orte_process_info.gpr_replica_uri) { uri = strdup(orte_process_info.gpr_replica_uri); } else { uri = orte_rml.get_uri(); } asprintf(&param, "\"%s\"", uri); opal_argv_append(&argc, &argv, param); free(uri); free(param); local_exec_index_end = argc; if (!(remote_csh || remote_sh)) { opal_argv_append(&argc, &argv, ")"); } if (mca_pls_rsh_component.debug) { param = opal_argv_join(argv, ' '); if (NULL != param) { opal_output(0, "pls:rsh: final template argv:"); opal_output(0, "pls:rsh: %s", param); free(param); } } /* Figure out the basenames for the libdir and bindir. This requires some explanation: - Use opal_install_dirs.libdir and opal_install_dirs.bindir instead of -D'ing some macros in this directory's Makefile.am because it makes all the dependencies work out correctly. These are defined in opal/install_dirs.h. - After a discussion on the devel-core mailing list, the developers decided that we should use the local directory basenames as the basis for the prefix on the remote node. This does not handle a few notable cases (e.g., if the libdir/bindir is not simply a subdir under the prefix, if the libdir/bindir basename is not the same on the remote node as it is here on the local node, etc.), but we decided that --prefix was meant to handle "the common case". If you need something more complex than this, a) edit your shell startup files to set PATH/LD_LIBRARY_PATH properly on the remote node, or b) use some new/to-be-defined options that explicitly allow setting the bindir/libdir on the remote node. We decided to implement these options (e.g., --remote-bindir and --remote-libdir) to orterun when it actually becomes a problem for someone (vs. a hypothetical situation). Hence, for now, we simply take the basename of this install's libdir and bindir and use it to append this install's prefix and use that on the remote node.
*/ lib_base = opal_basename(opal_install_dirs.libdir); bin_base = opal_basename(opal_install_dirs.bindir); /* * Iterate through each of the nodes */ if (mca_pls_rsh_component.timing) { /* allocate space to track the start times */ launchstart = (struct timeval*)malloc((num_nodes+vpid) * sizeof(struct timeval)); } for(n_item = opal_list_get_first(&map->nodes); n_item != opal_list_get_end(&map->nodes); n_item = opal_list_get_next(n_item)) { orte_process_name_t* name; pid_t pid; char *exec_path; char **exec_argv; rmaps_node = (orte_mapped_node_t*)n_item; if (mca_pls_rsh_component.timing) { if (0 != gettimeofday(&launchstart[vpid], NULL)) { opal_output(0, "pls_rsh: could not obtain start time"); } } /* new daemon - setup to record its info */ dmn = OBJ_NEW(orte_pls_daemon_info_t); dmn->active_job = jobid; opal_list_append(&active_daemons, &dmn->super); /* setup node name */ free(argv[node_name_index1]); if (NULL != rmaps_node->username && 0 != strlen (rmaps_node->username)) { asprintf (&argv[node_name_index1], "%s@%s", rmaps_node->username, rmaps_node->nodename); } else { argv[node_name_index1] = strdup(rmaps_node->nodename); } free(argv[node_name_index2]); argv[node_name_index2] = strdup(rmaps_node->nodename); /* save it in the daemon info */ dmn->nodename = strdup(rmaps_node->nodename); /* initialize daemons process name */ rc = orte_ns.create_process_name(&name, rmaps_node->cell, 0, vpid); if (ORTE_SUCCESS != rc) { ORTE_ERROR_LOG(rc); goto cleanup; } /* save it in the daemon info */ dmn->cell = rmaps_node->cell; if (ORTE_SUCCESS != (rc = orte_dss.copy((void**)&(dmn->name), name, ORTE_NAME))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* fork a child to exec the rsh/ssh session */ /* set the process state to "launched" */ if (ORTE_SUCCESS != (rc = orte_smr.set_proc_state(name, ORTE_PROC_STATE_LAUNCHED, 0))) { ORTE_ERROR_LOG(rc); goto cleanup; } pid = fork(); if (pid < 0) { rc = ORTE_ERR_OUT_OF_RESOURCE; goto cleanup; } /* child */ if (pid == 0) { char* name_string; char** env; char* var; long fd, fdmax = sysconf(_SC_OPEN_MAX); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: launching on node %s\n", rmaps_node->nodename); } /* We don't need to sense an oversubscribed condition and set the sched_yield * for the node as we are only launching the daemons at this time. The daemons * are now smart enough to set the oversubscribed condition themselves when * they launch the local procs. */ /* Is this a local launch? * * Not all node names may be resolvable (if we found * localhost in the hostfile, for example). So first * check trivial case of node_name being same as the * current nodename, which must be local. If that doesn't * match, check using ifislocal(). 
*/ if (!mca_pls_rsh_component.force_rsh && (0 == strcmp(rmaps_node->nodename, orte_system_info.nodename) || opal_ifislocal(rmaps_node->nodename))) { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: %s is a LOCAL node\n", rmaps_node->nodename); } if (mca_pls_rsh_component.timing) { /* since this is a local launch, the daemon will never reach * the waitpid callback - so set the start value to * something nonsensical */ launchstart[vpid].tv_sec = 0; launchstart[vpid].tv_usec = 0; } exec_path = opal_path_findv(argv[local_exec_index], 0, environ, NULL); if (NULL == exec_path && NULL == prefix_dir) { rc = orte_pls_rsh_fill_exec_path (&exec_path); if (ORTE_SUCCESS != rc) { exit(-1); /* the forked process MUST exit */ } } else { if (NULL != prefix_dir) { exec_path = opal_os_path( false, prefix_dir, bin_base, "orted", NULL ); } /* If we still have not filled in the exec path, do so now */ if (NULL == exec_path) { rc = orte_pls_rsh_fill_exec_path (&exec_path); if (ORTE_SUCCESS != rc) { exit(-1); /* the forked process MUST exit */ } } } /* If we have a prefix, then modify the PATH and LD_LIBRARY_PATH environment variables. We're already in the child process, so it's ok to modify environ. */ if (NULL != prefix_dir) { char *oldenv, *newenv; /* Reset PATH */ newenv = opal_os_path( false, prefix_dir, bin_base, NULL ); oldenv = getenv("PATH"); if (NULL != oldenv) { char *temp; asprintf(&temp, "%s:%s", newenv, oldenv ); free( newenv ); newenv = temp; } opal_setenv("PATH", newenv, true, &environ); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: reset PATH: %s", newenv); } free(newenv); /* Reset LD_LIBRARY_PATH */ newenv = opal_os_path( false, prefix_dir, lib_base, NULL ); oldenv = getenv("LD_LIBRARY_PATH"); if (NULL != oldenv) { char* temp; asprintf(&temp, "%s:%s", newenv, oldenv); free(newenv); newenv = temp; } opal_setenv("LD_LIBRARY_PATH", newenv, true, &environ); if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: reset LD_LIBRARY_PATH: %s", newenv); } free(newenv); } /* Since this is a local execution, we need to potentially whack the final ")" in the argv (if sh/csh conditionals, from above). Note that we're modifying the argv[] in the child process, so there's no need to save this and restore it afterward -- the parent's argv[] is unmodified. */ if (NULL != argv[local_exec_index_end]) { free(argv[local_exec_index_end]); argv[local_exec_index_end] = NULL; } /* tell the daemon to setup its own process session/group */ opal_argv_append(&argc, &argv, "--set-sid"); exec_argv = &argv[local_exec_index]; /* Finally, chdir($HOME) because we're making the assumption that this is what will happen on remote nodes (via rsh/ssh). This allows a user to specify a path that is relative to $HOME for both the cwd and argv[0] and it will work on all nodes -- including the local host. Otherwise, it would work on remote nodes and not the local node. If the user does not start in $HOME on the remote nodes... well... let's hope they start in $HOME. :-) */ var = getenv("HOME"); if (NULL != var) { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: changing to directory %s", var); } /* Ignore errors -- what are we going to do?
(and we ignore errors on the remote nodes in the fork pls, so this is consistent) */ chdir(var); } } else { if (mca_pls_rsh_component.debug) { opal_output(0, "pls:rsh: %s is a REMOTE node\n", rmaps_node->nodename); } exec_argv = argv; exec_path = strdup(mca_pls_rsh_component.agent_path); if (NULL != prefix_dir) { char *opal_prefix = getenv("OPAL_PREFIX"); if (remote_sh) { asprintf (&argv[local_exec_index], "%s%s%s PATH=%s/%s:$PATH ; export PATH ; " "LD_LIBRARY_PATH=%s/%s:$LD_LIBRARY_PATH ; export LD_LIBRARY_PATH ; " "%s/%s/%s", (opal_prefix != NULL ? "OPAL_PREFIX=" : ""), (opal_prefix != NULL ? opal_prefix : ""), (opal_prefix != NULL ? " ;" : ""), prefix_dir, bin_base, prefix_dir, lib_base, prefix_dir, bin_base, mca_pls_rsh_component.orted); } if (remote_csh) { /* [t]csh is a bit more challenging -- we have to check whether LD_LIBRARY_PATH is already set before we try to set it. Must be very careful about obeying [t]csh's order of evaluation and not using a variable before it is defined. See this thread for more details: http://www.open-mpi.org/community/lists/users/2006/01/0517.php. */ asprintf (&argv[local_exec_index], "%s%s%s set path = ( %s/%s $path ) ; " "if ( $?LD_LIBRARY_PATH == 1 ) " "set OMPI_have_llp ; " "if ( $?LD_LIBRARY_PATH == 0 ) " "setenv LD_LIBRARY_PATH %s/%s ; " "if ( $?OMPI_have_llp == 1 ) " "setenv LD_LIBRARY_PATH %s/%s:$LD_LIBRARY_PATH ; " "%s/%s/%s", (opal_prefix != NULL ? "setenv OPAL_PREFIX " : ""), (opal_prefix != NULL ? opal_prefix : ""), (opal_prefix != NULL ? " ;" : ""), prefix_dir, bin_base, prefix_dir, lib_base, prefix_dir, lib_base, prefix_dir, bin_base, mca_pls_rsh_component.orted); } } } /* setup process name */ rc = orte_ns.get_proc_name_string(&name_string, name); if (ORTE_SUCCESS != rc) { opal_output(0, "orte_pls_rsh: unable to create process name"); exit(-1); } free(argv[proc_name_index]); argv[proc_name_index] = strdup(name_string); if (!mca_pls_rsh_component.debug) { /* setup stdin */ int fd = open("/dev/null", O_RDWR); dup2(fd, 0); close(fd); } /* close all file descriptors w/ exception of stdin/stdout/stderr */ for(fd=3; fd<fdmax; fd++) close(fd); /* Set signal handlers back to the default. Do this close to the execve() because the event library may (and likely will) reset them. If we don't do this, the event library may have left some set that, at least on some OS's, don't get reset via fork() or exec(). Hence, the orted could be unkillable (for example). */ set_handler_default(SIGTERM); set_handler_default(SIGINT); set_handler_default(SIGHUP); set_handler_default(SIGPIPE); set_handler_default(SIGCHLD); /* Unblock all signals, for many of the same reasons that we set the default handlers, above. This is noticable on Linux where the event library blocks SIGTERM, but we don't want that blocked by the orted (or, more specifically, we don't want it to be blocked by the orted and then inherited by the ORTE processes that it forks, making them unkillable by SIGTERM). 
*/ sigprocmask(0, 0, &sigs); sigprocmask(SIG_UNBLOCK, &sigs, 0); /* setup environment */ env = opal_argv_copy(environ); var = mca_base_param_environ_variable("seed",NULL,NULL); opal_setenv(var, "0", true, &env); /* exec the daemon */ if (mca_pls_rsh_component.debug) { param = opal_argv_join(exec_argv, ' '); if (NULL != param) { char* env_array = opal_argv_join( env, ' ' ); opal_output(0, "pls:rsh: executing: (%s) %s [%s]", exec_path, param, env_array); free(param); free(env_array); } } execve(exec_path, exec_argv, env); opal_output(0, "pls:rsh: execve of %s failed with errno=%s(%d)\n", exec_path, strerror(errno), errno); exit(-1); } else { /* parent */ OPAL_THREAD_LOCK(&mca_pls_rsh_component.lock); /* JJH Bug: * If we are in '--debug-daemons' we keep the ssh connection * alive for the span of the run. If we use this option * AND we launch on more than "num_concurrent" machines * then we will deadlock. No connections are terminated * until the job is complete, no job is started * since all the orteds are waiting for all the others * to come online, and the others are not launched because * we are waiting on those that have started to terminate * their ssh tunnels. :( */ if (mca_pls_rsh_component.num_children++ >= mca_pls_rsh_component.num_concurrent) { opal_condition_wait(&mca_pls_rsh_component.cond, &mca_pls_rsh_component.lock); } OPAL_THREAD_UNLOCK(&mca_pls_rsh_component.lock); /* setup callback on sigchild - wait until setup above is complete * as the callback can occur in the call to orte_wait_cb */ orte_wait_cb(pid, orte_pls_rsh_wait_daemon, dmn); /* if required - add delay to avoid problems w/ X11 authentication */ if (mca_pls_rsh_component.debug && mca_pls_rsh_component.delay) { sleep(mca_pls_rsh_component.delay); } vpid++; } free(name); } /* all done, so store the daemon info on the registry */ if (ORTE_SUCCESS != (rc = orte_pls_base_store_active_daemons(&active_daemons))) { ORTE_ERROR_LOG(rc); } cleanup: OBJ_RELEASE(map); if (NULL != lib_base) { free(lib_base); } if (NULL != bin_base) { free(bin_base); } if (NULL != jobid_string) free(jobid_string); /* done with this variable */ if (NULL != argv) opal_argv_free(argv); return rc; }
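The parent branch above throttles how many rsh/ssh children are in flight: it increments num_children under the component lock and condition-waits once the count reaches num_concurrent, relying on the child-exit (waitpid) callback to decrement and signal. A self-contained sketch of that throttle with plain pthreads follows; the function names and the limit are made up for illustration, and the wait-at-most-once shape mirrors the single opal_condition_wait in the source.

#include <pthread.h>

/* stand-in for the pls_rsh component's concurrency throttle */
static pthread_mutex_t launch_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  launch_cond = PTHREAD_COND_INITIALIZER;
static int num_children;                 /* ssh sessions currently in flight */
static const int num_concurrent = 128;   /* illustrative limit */

/* call after fork()ing one ssh child: block if too many are outstanding */
static void throttle_after_launch(void)
{
    pthread_mutex_lock(&launch_lock);
    if (++num_children >= num_concurrent) {
        pthread_cond_wait(&launch_cond, &launch_lock);
    }
    pthread_mutex_unlock(&launch_lock);
}

/* call from the child-exit (waitpid) callback: release one waiter */
static void throttle_child_exited(void)
{
    pthread_mutex_lock(&launch_lock);
    num_children--;
    pthread_cond_signal(&launch_cond);
    pthread_mutex_unlock(&launch_lock);
}

With --debug-daemons the ssh sessions never exit, so once the limit is reached throttle_child_exited() is never called and the launcher blocks forever: exactly the deadlock described in the JJH note above.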
int orte_pls_base_orted_add_local_procs(opal_list_t *daemons, orte_gpr_notify_data_t *ndat) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_ADD_LOCAL_PROCS; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; OPAL_TRACE(1); /* pack the command */ OBJ_CONSTRUCT(&cmd, orte_buffer_t); if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* pack the launch data for the daemons */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &ndat, 1, ORTE_GPR_NOTIFY_DATA))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } for (item = opal_list_get_first(daemons); item != opal_list_get_end(daemons); item = opal_list_get_next(item)) { dmn = (orte_pls_daemon_info_t*)item; if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0, orte_pls_base_orted_send_cb, NULL)) { /* rc still holds ORTE_SUCCESS here, so return the real error */ ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_DESTRUCT(&cmd); return ORTE_ERR_COMM_FAILURE; } orted_cmd_num_active++; } OBJ_DESTRUCT(&cmd); /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } /* wait for the command to have been received */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); return ORTE_SUCCESS; CLEANUP: OBJ_DESTRUCT(&cmd); return rc; }
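orte_pls_base_orted_add_local_procs (and the kill/signal variants below) follow the same fan-out pattern: send the packed command to every daemon with a non-blocking send, count each send in orted_cmd_num_active, and then condition-wait until the ack callback has drained the counter. Here is a pthreads-only sketch of that bookkeeping; the names are illustrative, and the wait is written as a loop, which is the robust form of the single opal_condition_wait used in the source.

#include <pthread.h>

/* stand-in for orted_cmd_num_active plus the base command lock/condition */
static pthread_mutex_t cmd_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cmd_cond = PTHREAD_COND_INITIALIZER;
static int cmds_outstanding;

/* bump the counter once per daemon we send a command to */
static void cmd_sent(void)
{
    pthread_mutex_lock(&cmd_lock);
    cmds_outstanding++;
    pthread_mutex_unlock(&cmd_lock);
}

/* ack receive callback: decrement, wake the waiter when all acks are in */
static void cmd_acked(void)
{
    pthread_mutex_lock(&cmd_lock);
    if (--cmds_outstanding == 0) {
        pthread_cond_signal(&cmd_cond);
    }
    pthread_mutex_unlock(&cmd_lock);
}

/* block the caller until every daemon has acknowledged the command */
static void wait_for_acks(void)
{
    pthread_mutex_lock(&cmd_lock);
    while (cmds_outstanding > 0) {
        pthread_cond_wait(&cmd_cond, &cmd_lock);
    }
    pthread_mutex_unlock(&cmd_lock);
}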
int ompi_request_default_wait_some( size_t count, ompi_request_t ** requests, int * outcount, int * indices, ompi_status_public_t * statuses) { #if OMPI_ENABLE_PROGRESS_THREADS int c; #endif size_t i, num_requests_null_inactive=0, num_requests_done=0; int rc = MPI_SUCCESS; ompi_request_t **rptr=NULL; ompi_request_t *request=NULL; *outcount = 0; for (i = 0; i < count; i++) { indices[i] = 0; } #if OMPI_ENABLE_PROGRESS_THREADS /* poll for completion */ OPAL_THREAD_ADD32(&ompi_progress_thread_count,1); for (c = 0; c < opal_progress_spin_count; c++) { rptr = requests; num_requests_null_inactive = 0; num_requests_done = 0; for (i = 0; i < count; i++, rptr++) { request = *rptr; /* * Check for null or completed persistent request. * For MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE */ if (request->req_state == OMPI_REQUEST_INACTIVE ) { num_requests_null_inactive++; continue; } if (true == request->req_complete) { indices[i] = 1; num_requests_done++; } } if (num_requests_null_inactive == count || num_requests_done > 0) { OPAL_THREAD_ADD32(&ompi_progress_thread_count,-1); goto finished; } opal_progress(); } OPAL_THREAD_ADD32(&ompi_progress_thread_count,-1); #endif /* * We only get here when outcount still is 0. * give up and sleep until completion */ OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_waiting++; do { rptr = requests; num_requests_null_inactive = 0; num_requests_done = 0; for (i = 0; i < count; i++, rptr++) { request = *rptr; /* * Check for null or completed persistent request. * For MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE. */ if( request->req_state == OMPI_REQUEST_INACTIVE ) { num_requests_null_inactive++; continue; } if (request->req_complete == true) { indices[i] = 1; num_requests_done++; } } if (num_requests_null_inactive == count || num_requests_done > 0) break; opal_condition_wait(&ompi_request_cond, &ompi_request_lock); } while (1); ompi_request_waiting--; OPAL_THREAD_UNLOCK(&ompi_request_lock); #if OMPI_ENABLE_PROGRESS_THREADS finished: #endif /* OMPI_ENABLE_PROGRESS_THREADS */ #if OPAL_ENABLE_FT_CR == 1 if( opal_cr_is_enabled) { rptr = requests; for (i = 0; i < count; i++, rptr++) { request = *rptr; if( true == request->req_complete) { OMPI_CRCP_REQUEST_COMPLETE(request); } } } #endif if(num_requests_null_inactive == count) { *outcount = MPI_UNDEFINED; } else { /* * Compress the index array. */ for (i = 0, num_requests_done = 0; i < count; i++) { if (0 != indices[i]) { indices[num_requests_done++] = i; } } *outcount = num_requests_done; for (i = 0; i < num_requests_done; i++) { request = requests[indices[i]]; assert( true == request->req_complete ); /* Per note above, we have to call gen request query_fn even if STATUS_IGNORE was provided */ if (OMPI_REQUEST_GEN == request->req_type) { ompi_grequest_invoke_query(request, &request->req_status); } if (MPI_STATUSES_IGNORE != statuses) { statuses[i] = request->req_status; } if (MPI_SUCCESS != request->req_status.MPI_ERROR) { rc = MPI_ERR_IN_STATUS; } if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; } else { /* Only free the request if there was no error */ if (MPI_SUCCESS == request->req_status.MPI_ERROR) { int tmp; tmp = ompi_request_free(&(requests[indices[i]])); if (OMPI_SUCCESS != tmp) { return tmp; } } } } } return rc; }
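The tail of ompi_request_default_wait_some converts the 0/1 completion mask built during the scan into the dense index list that MPI_Waitsome hands back, reusing the same indices array in place. A small self-contained illustration of that compression step:

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

/*
 * indices[] starts as a 0/1 completion mask over all requests and is then
 * compressed in place into the list of completed positions; the return
 * value becomes *outcount.
 */
static size_t compress_indices(int *indices, size_t count)
{
    size_t i, ndone = 0;
    for (i = 0; i < count; i++) {
        if (0 != indices[i]) {
            indices[ndone++] = (int)i;   /* overwrite the mask with positions */
        }
    }
    return ndone;
}

int main(void)
{
    int mask[6] = { 0, 1, 0, 0, 1, 1 };  /* requests 1, 4 and 5 completed */
    size_t n = compress_indices(mask, 6);
    assert(3 == n && 1 == mask[0] && 4 == mask[1] && 5 == mask[2]);
    printf("%zu requests completed\n", n);
    return 0;
}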
int orte_pls_base_orted_kill_local_procs(opal_list_t *daemons, orte_jobid_t job, struct timeval *timeout) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_KILL_LOCAL_PROCS; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; opal_event_t* event = NULL; OPAL_TRACE(1); OBJ_CONSTRUCT(&cmd, orte_buffer_t); /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&cmd); return rc; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &job, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&cmd); return rc; } /* send the commands as fast as we can */ for (item = opal_list_get_first(daemons); item != opal_list_get_end(daemons); item = opal_list_get_next(item)) { dmn = (orte_pls_daemon_info_t*)item; if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0, orte_pls_base_orted_send_cb, NULL)) { /* rc still holds ORTE_SUCCESS here, so return the real error */ ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_DESTRUCT(&cmd); return ORTE_ERR_COMM_FAILURE; } orted_cmd_num_active++; } OBJ_DESTRUCT(&cmd); /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } /* define the default completion status */ completion_status = ORTE_SUCCESS; /* wait for all commands to have been received */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { /* setup a delay to give the orteds time to complete their departure - wake us up if they * don't exit by the prescribed time */ if (NULL != timeout && /* only do this if the user gave us a time to wait */ NULL != (event = (opal_event_t*)malloc(sizeof(opal_event_t)))) { opal_evtimer_set(event, orte_pls_base_orted_default_wakeup, NULL); opal_evtimer_add(event, timeout); } /* now go to sleep until woken up */ opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); /* log an error if one occurred */ if (ORTE_SUCCESS != completion_status) { ORTE_ERROR_LOG(completion_status); } /* if started, kill the timer event so it doesn't hit us later */ if (NULL != event) { opal_evtimer_del(event); free(event); } /* we're done! */ return completion_status; }
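The kill path above arms an event-library timer so the condition wait cannot hang forever if some orted never acks. With plain pthreads the same safety net can be built with pthread_cond_timedwait and an absolute deadline derived from the relative struct timeval; a sketch under that assumption (the state names are made up):

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <sys/time.h>
#include <time.h>

/* stand-in state: the ack callback sets all_acks_in and signals the condition */
static pthread_mutex_t ack_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ack_cond = PTHREAD_COND_INITIALIZER;
static bool all_acks_in = false;

/*
 * Wait for the acks, but give up once the relative timeout (a struct timeval,
 * like the argument received above) has elapsed.  Returns true if every ack
 * arrived before the deadline.
 */
static bool wait_for_acks_timed(const struct timeval *timeout)
{
    struct timespec deadline;
    bool done;

    /* pthread_cond_timedwait wants an absolute CLOCK_REALTIME deadline */
    clock_gettime(CLOCK_REALTIME, &deadline);
    deadline.tv_sec  += timeout->tv_sec;
    deadline.tv_nsec += (long)timeout->tv_usec * 1000L;
    if (deadline.tv_nsec >= 1000000000L) {      /* carry into seconds */
        deadline.tv_sec  += 1;
        deadline.tv_nsec -= 1000000000L;
    }

    pthread_mutex_lock(&ack_lock);
    while (!all_acks_in) {
        if (ETIMEDOUT == pthread_cond_timedwait(&ack_cond, &ack_lock, &deadline)) {
            break;                              /* some orted never reported */
        }
    }
    done = all_acks_in;
    pthread_mutex_unlock(&ack_lock);
    return done;
}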
void orte_grpcomm_base_daemon_collective(orte_process_name_t *sender, opal_buffer_t *data) { orte_jobid_t jobid; orte_odls_job_t *jobdat; orte_routed_tree_t *child; orte_std_cntr_t n; opal_list_t daemon_tree; opal_list_item_t *item, *next; int32_t num_contributors; opal_buffer_t buf; orte_process_name_t my_parent, proc; orte_vpid_t daemonvpid; int rc; int32_t numc; orte_rml_tag_t rmltag; OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:daemon_coll: daemon collective called", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* unpack the jobid using this collective */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* lookup the job record for it */ jobdat = NULL; for (item = opal_list_get_first(&orte_local_jobdata); item != opal_list_get_end(&orte_local_jobdata); item = opal_list_get_next(item)) { jobdat = (orte_odls_job_t*)item; /* is this the specified job? */ if (jobdat->jobid == jobid) { break; } } if (NULL == jobdat) { /* race condition - someone sent us a collective before we could * parse the add_local_procs cmd. Just add the jobdat object * and continue */ jobdat = OBJ_NEW(orte_odls_job_t); jobdat->jobid = jobid; opal_list_append(&orte_local_jobdata, &jobdat->super); } /* it may be possible to get here prior to having actually finished processing our * local launch msg due to the race condition between different nodes and when * they start their individual procs. Hence, we have to first ensure that we * -have- finished processing the launch msg, or else we won't know whether * or not to wait before sending this on */ OPAL_THREAD_LOCK(&jobdat->lock); while (!jobdat->launch_msg_processed) { opal_condition_wait(&jobdat->cond, &jobdat->lock); } OPAL_THREAD_UNLOCK(&jobdat->lock); /* unpack the tag for this collective */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &rmltag, &n, ORTE_RML_TAG))) { ORTE_ERROR_LOG(rc); return; } /* unpack the number of contributors in this data bucket */ n = 1; if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &num_contributors, &n, OPAL_INT32))) { ORTE_ERROR_LOG(rc); return; } jobdat->num_contributors += num_contributors; /* xfer the data */ opal_dss.copy_payload(&jobdat->collection_bucket, data); /* count the number of participants collected */ jobdat->num_collected++; /* if we haven't already done so, figure out how many participants we * should be expecting */ if (jobdat->num_participating < 0) { if (0 < jobdat->num_local_procs) { /* we have children, so account for our own participation */ jobdat->num_participating = 1; } else { jobdat->num_participating = 0; } /* now see if anyone else will be sending us something */ OBJ_CONSTRUCT(&daemon_tree, opal_list_t); orte_routed.get_routing_tree(&daemon_tree); /* unfortunately, there is no simple way to determine which of our "child" * daemons in the routing tree will be sending us something. 
All we can do * is brute force a search, though we attempt to keep it as short as possible */ proc.jobid = jobid; proc.vpid = 0; while (proc.vpid < jobdat->num_procs && 0 < opal_list_get_size(&daemon_tree)) { ORTE_EPOCH_SET(proc.epoch,orte_ess.proc_get_epoch(&proc)); /* get the daemon that hosts this proc */ daemonvpid = orte_ess.proc_get_daemon(&proc); /* is this daemon one of our children, or at least its contribution * will pass through one of our children */ item = opal_list_get_first(&daemon_tree); while (item != opal_list_get_end(&daemon_tree)) { next = opal_list_get_next(item); child = (orte_routed_tree_t*)item; if (child->vpid == daemonvpid || opal_bitmap_is_set_bit(&child->relatives, daemonvpid)) { /* it does - add to num_participating */ jobdat->num_participating++; /* remove this from the list so we don't double count it */ opal_list_remove_item(&daemon_tree, item); /* done with search */ break; } item = next; } proc.vpid++; } } OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:daemon_coll: daemon collective for job %s from %s type %ld" " num_collected %d num_participating %d num_contributors %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid), ORTE_NAME_PRINT(sender), (long)jobdat->collective_type, jobdat->num_collected, jobdat->num_participating, jobdat->num_contributors)); if (jobdat->num_collected == jobdat->num_participating) { /* if I am the HNP, go process the results */ if (ORTE_PROC_IS_HNP) { goto hnp_process; } /* if I am not the HNP, send to my parent */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); return; } /* pack the target tag */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &rmltag, 1, ORTE_RML_TAG))) { ORTE_ERROR_LOG(rc); return; } /* pack the number of contributors */ if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &jobdat->num_contributors, 1, OPAL_INT32))) { ORTE_ERROR_LOG(rc); return; } /* xfer the payload*/ opal_dss.copy_payload(&buf, &jobdat->collection_bucket); /* reset everything for next collective */ jobdat->num_contributors = 0; jobdat->num_collected = 0; OBJ_DESTRUCT(&jobdat->collection_bucket); OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); /* send it */ my_parent.jobid = ORTE_PROC_MY_NAME->jobid; my_parent.vpid = orte_routed.get_routing_tree(NULL); ORTE_EPOCH_SET(my_parent.epoch,orte_ess.proc_get_epoch(&my_parent)); OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:daemon_coll: daemon collective not the HNP - sending to parent %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&my_parent))); if (0 > (rc = orte_rml.send_buffer(&my_parent, &buf, ORTE_RML_TAG_DAEMON_COLLECTIVE, 0))) { ORTE_ERROR_LOG(rc); return; } OBJ_DESTRUCT(&buf); } return; hnp_process: OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base.output, "%s grpcomm:base:daemon_coll: daemon collective HNP - xcasting to job %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jobid))); /* setup a buffer to send the results back to the job members */ OBJ_CONSTRUCT(&buf, opal_buffer_t); /* add any collected data */ numc = jobdat->num_contributors; if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &numc, 1, OPAL_INT32))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_SUCCESS != (rc = opal_dss.copy_payload(&buf, &jobdat->collection_bucket))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* reset everything for next collective */ jobdat->num_contributors = 0; jobdat->num_collected = 0; OBJ_DESTRUCT(&jobdat->collection_bucket); 
OBJ_CONSTRUCT(&jobdat->collection_bucket, opal_buffer_t); /* send the buffer */ if (ORTE_SUCCESS != (rc = orte_grpcomm.xcast(jobid, &buf, rmltag))) { ORTE_ERROR_LOG(rc); } cleanup: OBJ_DESTRUCT(&buf); return; }
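The brute-force search in the collective above walks every proc's hosting daemon and checks whether it is one of our routing-tree children, either directly (vpid match) or through the child's relatives bitmap, counting each child subtree at most once. The following is a simplified, self-contained model of that count; a 64-bit mask stands in for opal_bitmap_t and all names are illustrative.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct tree_child {
    uint32_t vpid;       /* the child daemon itself */
    uint64_t relatives;  /* bit i set => daemon vpid i routes through this child */
    bool     counted;    /* already expect a contribution from this subtree */
};

/* one contribution per child subtree that hosts at least one proc, plus
 * one for ourselves if we have local procs */
static int count_participants(struct tree_child *children, int nchildren,
                              const uint32_t *proc_daemons, int nprocs,
                              bool have_local_procs)
{
    int expected = have_local_procs ? 1 : 0;
    for (int p = 0; p < nprocs; p++) {
        for (int c = 0; c < nchildren; c++) {
            if (children[c].counted) {
                continue;
            }
            if (children[c].vpid == proc_daemons[p] ||
                (children[c].relatives & (UINT64_C(1) << proc_daemons[p]))) {
                children[c].counted = true;    /* don't double count the subtree */
                expected++;
                break;
            }
        }
    }
    return expected;
}

int main(void)
{
    struct tree_child kids[2] = {
        { .vpid = 1, .relatives = UINT64_C(1) << 3 },  /* child 1 covers daemon 3 */
        { .vpid = 2, .relatives = 0 },
    };
    uint32_t proc_daemons[] = { 3, 3, 2 };   /* procs hosted on daemons 3, 3, 2 */
    printf("expected contributions: %d\n",
           count_participants(kids, 2, proc_daemons, 3, true));  /* prints 3 */
    return 0;
}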
int ompi_request_default_wait_any( size_t count, ompi_request_t ** requests, int *index, ompi_status_public_t * status) { #if OMPI_ENABLE_PROGRESS_THREADS int c; #endif size_t i=0, num_requests_null_inactive=0; int rc = OMPI_SUCCESS; int completed = -1; ompi_request_t **rptr=NULL; ompi_request_t *request=NULL; #if OMPI_ENABLE_PROGRESS_THREADS /* poll for completion */ OPAL_THREAD_ADD32(&ompi_progress_thread_count,1); for (c = 0; completed < 0 && c < opal_progress_spin_count; c++) { rptr = requests; num_requests_null_inactive = 0; for (i = 0; i < count; i++, rptr++) { request = *rptr; /* * Check for null or completed persistent request. * For MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE */ if( request->req_state == OMPI_REQUEST_INACTIVE ) { num_requests_null_inactive++; continue; } if (true == request->req_complete) { completed = i; OPAL_THREAD_ADD32(&ompi_progress_thread_count,-1); goto finished; } } if( num_requests_null_inactive == count ) { OPAL_THREAD_ADD32(&ompi_progress_thread_count,-1); goto finished; } opal_progress(); } OPAL_THREAD_ADD32(&ompi_progress_thread_count,-1); #endif /* give up and sleep until completion */ OPAL_THREAD_LOCK(&ompi_request_lock); ompi_request_waiting++; do { rptr = requests; num_requests_null_inactive = 0; for (i = 0; i < count; i++, rptr++) { request = *rptr; /* Sanity test */ if( NULL == request) { continue; } /* * Check for null or completed persistent request. * For MPI_REQUEST_NULL, the req_state is always OMPI_REQUEST_INACTIVE. */ if( request->req_state == OMPI_REQUEST_INACTIVE ) { num_requests_null_inactive++; continue; } if (request->req_complete == true) { completed = i; break; } } if(num_requests_null_inactive == count) break; if (completed < 0) { opal_condition_wait(&ompi_request_cond, &ompi_request_lock); } } while (completed < 0); ompi_request_waiting--; OPAL_THREAD_UNLOCK(&ompi_request_lock); #if OMPI_ENABLE_PROGRESS_THREADS finished: #endif /* OMPI_ENABLE_PROGRESS_THREADS */ if(num_requests_null_inactive == count) { *index = MPI_UNDEFINED; if (MPI_STATUS_IGNORE != status) { *status = ompi_status_empty; } } else { assert( true == request->req_complete ); /* Per note above, we have to call gen request query_fn even if STATUS_IGNORE was provided */ if (OMPI_REQUEST_GEN == request->req_type) { rc = ompi_grequest_invoke_query(request, &request->req_status); } if (MPI_STATUS_IGNORE != status) { /* Do *NOT* set status->MPI_ERROR here! See MPI-1.1 doc, sec 3.2.5, p.22 */ int old_error = status->MPI_ERROR; *status = request->req_status; status->MPI_ERROR = old_error; } rc = request->req_status.MPI_ERROR; if( request->req_persistent ) { request->req_state = OMPI_REQUEST_INACTIVE; } else if (MPI_SUCCESS == rc) { /* Only free the request if there is no error on it */ /* If there's an error while freeing the request, assume that the request is still there. Otherwise, Bad Things will happen later! */ rc = ompi_request_free(rptr); } *index = completed; } #if OPAL_ENABLE_FT_CR == 1 if( opal_cr_is_enabled) { rptr = requests; for (i = 0; i < count; i++, rptr++) { request = *rptr; if( true == request->req_complete) { OMPI_CRCP_REQUEST_COMPLETE(request); } } } #endif return rc; }
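ompi_request_default_wait_any uses a single global lock/condition pair for all requests: scan for a completed entry, and if none is found, sleep on ompi_request_cond and rescan after every wakeup. A stripped-down version of that loop over plain completion flags, using pthreads (names are illustrative):

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

/* one global lock/condition for all entries, as with ompi_request_lock */
static pthread_mutex_t req_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  req_cond = PTHREAD_COND_INITIALIZER;

/* completion side: mark an entry done and wake the waiters so they rescan */
static void complete_one(bool *complete, size_t i)
{
    pthread_mutex_lock(&req_lock);
    complete[i] = true;
    pthread_cond_broadcast(&req_cond);
    pthread_mutex_unlock(&req_lock);
}

/* scan for a completed entry; if nothing is ready yet, sleep and rescan */
static size_t wait_any_flag(bool *complete, size_t count)
{
    size_t completed = (size_t)-1;

    pthread_mutex_lock(&req_lock);
    do {
        for (size_t i = 0; i < count; i++) {
            if (complete[i]) {
                completed = i;
                break;
            }
        }
        if ((size_t)-1 == completed) {
            pthread_cond_wait(&req_cond, &req_lock);
        }
    } while ((size_t)-1 == completed);
    pthread_mutex_unlock(&req_lock);
    return completed;
}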
int orte_pls_base_orted_signal_local_procs(opal_list_t *daemons, int32_t signal) { int rc; orte_buffer_t cmd; orte_daemon_cmd_flag_t command=ORTE_DAEMON_SIGNAL_LOCAL_PROCS; opal_list_item_t *item; orte_pls_daemon_info_t *dmn; OPAL_TRACE(1); OBJ_CONSTRUCT(&cmd, orte_buffer_t); /* pack the command */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* pack the signal to deliver */ if (ORTE_SUCCESS != (rc = orte_dss.pack(&cmd, &signal, 1, ORTE_INT32))) { ORTE_ERROR_LOG(rc); goto CLEANUP; } /* send the commands as fast as we can */ for (item = opal_list_get_first(daemons); item != opal_list_get_end(daemons); item = opal_list_get_next(item)) { dmn = (orte_pls_daemon_info_t*)item; if (0 > orte_rml.send_buffer_nb(dmn->name, &cmd, ORTE_RML_TAG_PLS_ORTED, 0, orte_pls_base_orted_send_cb, NULL)) { /* rc still holds ORTE_SUCCESS here, so return the real error */ ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE); OBJ_DESTRUCT(&cmd); return ORTE_ERR_COMM_FAILURE; } orted_cmd_num_active++; } /* post the receive for the ack's */ rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLS_ORTED_ACK, ORTE_RML_NON_PERSISTENT, orte_pls_base_cmd_ack, NULL); if (rc != ORTE_SUCCESS) { ORTE_ERROR_LOG(rc); return rc; } /* wait for all commands to have been received */ OPAL_THREAD_LOCK(&orte_pls_base.orted_cmd_lock); if (orted_cmd_num_active > 0) { opal_condition_wait(&orte_pls_base.orted_cmd_cond, &orte_pls_base.orted_cmd_lock); } OPAL_THREAD_UNLOCK(&orte_pls_base.orted_cmd_lock); CLEANUP: OBJ_DESTRUCT(&cmd); /* we're done - return the final status, not unconditional success */ return rc; }
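The buffer sent above is just a typed sequence: one daemon-command flag followed by the 32-bit signal number. The hand-rolled pack/unpack below shows that layout only; it is a stand-in for orte_dss.pack()/unpack(), which additionally records type tags and handles heterogeneous byte order, and the command value used here is invented for the demo.

#include <assert.h>
#include <signal.h>
#include <stdint.h>
#include <string.h>

/* invented command value, for the demo only */
#define DEMO_CMD_SIGNAL_LOCAL_PROCS ((uint8_t)0x05)

/* pack: one command byte followed by the raw 32-bit signal number */
static size_t pack_signal_cmd(uint8_t *buf, int32_t signal_num)
{
    size_t off = 0;
    buf[off++] = DEMO_CMD_SIGNAL_LOCAL_PROCS;
    memcpy(buf + off, &signal_num, sizeof(signal_num));
    return off + sizeof(signal_num);
}

/* unpack: verify the command byte, then pull the payload back out */
static int32_t unpack_signal_cmd(const uint8_t *buf, size_t len)
{
    int32_t signal_num;
    assert(len >= 1 + sizeof(signal_num));
    assert(DEMO_CMD_SIGNAL_LOCAL_PROCS == buf[0]);
    memcpy(&signal_num, buf + 1, sizeof(signal_num));
    return signal_num;
}

int main(void)
{
    uint8_t wire[8];
    size_t n = pack_signal_cmd(wire, SIGTERM);
    assert(SIGTERM == unpack_signal_cmd(wire, n));
    return 0;
}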
int orte_rml_oob_send(orte_process_name_t* peer, struct iovec *iov, int count, int tag, int flags) { orte_rml_oob_msg_t *msg = OBJ_NEW(orte_rml_oob_msg_t); int ret; orte_process_name_t next; int real_tag; int i; int bytes = 0; if (ORTE_RML_TAG_INVALID == tag) { /* cannot send to an invalid tag */ ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM); return ORTE_ERR_BAD_PARAM; } msg->msg_type = ORTE_RML_BLOCKING_SEND; flags |= ORTE_RML_FLAG_RECURSIVE_CALLBACK; next = orte_routed.get_route(peer); if (next.vpid == ORTE_VPID_INVALID) { ORTE_ERROR_LOG(ORTE_ERR_ADDRESSEE_UNKNOWN); opal_output(0, "%s could not get route to %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer)); return ORTE_ERR_ADDRESSEE_UNKNOWN; } msg->msg_data = (struct iovec *) malloc(sizeof(struct iovec) * (count + 1)); msg->msg_data[0].iov_base = (ompi_iov_base_ptr_t)&msg->msg_header; msg->msg_data[0].iov_len = sizeof(orte_rml_oob_msg_header_t); bytes += msg->msg_data[0].iov_len; for (i = 0 ; i < count ; ++i) { msg->msg_data[i + 1].iov_base = iov[i].iov_base; msg->msg_data[i + 1].iov_len = iov[i].iov_len; bytes += msg->msg_data[i + 1].iov_len; } msg->msg_header.origin = *ORTE_PROC_MY_NAME; msg->msg_header.destination = *peer; msg->msg_header.tag = tag; ORTE_RML_OOB_MSG_HEADER_HTON(msg->msg_header); if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &next, peer)) { real_tag = tag; } else { real_tag = ORTE_RML_TAG_RML_ROUTE; } OPAL_OUTPUT_VERBOSE((1, orte_rml_base_output, "rml_send %s -> %s (router %s, tag %d, %d)", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(peer), ORTE_NAME_PRINT(&next), tag, real_tag)); ret = orte_rml_oob_module.active_oob->oob_send_nb(&next, ORTE_PROC_MY_NAME, msg->msg_data, count + 1, real_tag, flags, orte_rml_send_msg_callback, msg); if (ret < 0) { ORTE_ERROR_LOG(ret); opal_output(0, "%s attempted to send to %s: tag %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&next), (int)real_tag); goto cleanup; } OPAL_THREAD_LOCK(&msg->msg_lock); while (!msg->msg_complete) { opal_condition_wait(&msg->msg_cond, &msg->msg_lock); } ret = msg->msg_status; OPAL_THREAD_UNLOCK(&msg->msg_lock); cleanup: OBJ_RELEASE(msg); return ret; }
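orte_rml_oob_send prepends one extra iovec carrying the RML header (origin, final destination, tag) in front of the caller's iovecs so the OOB layer can push the whole message in a single gathered operation. Here is a minimal sketch of that layout using POSIX writev(); the header struct and the fixed fan-out limit are illustrative, not the real orte_rml_oob_msg_header_t or OOB interface.

#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

/* illustrative header only - not the real orte_rml_oob_msg_header_t */
struct demo_rml_header {
    uint32_t origin_vpid;     /* who the message is from */
    uint32_t dest_vpid;       /* final destination (may differ from the router) */
    int32_t  tag;             /* RML tag the receiver posted on */
};

/*
 * Gather the header plus the caller's fragments into one writev() call,
 * mirroring how msg_data[0] holds the header and msg_data[1..count] alias
 * the caller's iovecs in the send path above.
 */
static ssize_t send_with_header(int fd, struct demo_rml_header *hdr,
                                const struct iovec *payload, int count)
{
    struct iovec vec[1 + 16];

    if (count < 0 || count > 16) {
        return -1;                               /* keep the sketch bounded */
    }
    vec[0].iov_base = hdr;
    vec[0].iov_len  = sizeof(*hdr);
    memcpy(&vec[1], payload, (size_t)count * sizeof(struct iovec));
    return writev(fd, vec, count + 1);           /* one gathered write */
}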