/*
 * Parse the given hostfile and append its nodes to the caller's list,
 * honoring any exclusions the hostfile specifies.
 *
 * nodes                    - list of orte_node_t to which parsed nodes are added
 * override_oversubscribed  - set to true so ORTE re-checks oversubscription
 *                            against local hardware limits
 * hostfile                 - path of the hostfile to parse
 *
 * Returns ORTE_SUCCESS, ORTE_ERR_SILENT on relative-node syntax, or the
 * error code from hostfile_parse.
 */
int orte_util_add_hostfile_nodes(opal_list_t *nodes,
                                 bool *override_oversubscribed,
                                 char *hostfile)
{
    opal_list_t exclude;
    opal_list_item_t *cursor, *probe;
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s hostfile: checking hostfile %s for nodes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));

    OBJ_CONSTRUCT(&exclude, opal_list_t);

    /* pull the hostfile contents into the caller's list, collecting any
     * excluded entries in a separate list */
    rc = hostfile_parse(hostfile, nodes, &exclude, false);
    if (ORTE_SUCCESS != rc) {
        goto cleanup;
    }

    /* relative node directives ("+...") are not valid in this context -
     * scan the parsed nodes for them */
    for (cursor = opal_list_get_first(nodes);
         cursor != opal_list_get_end(nodes);
         cursor = opal_list_get_next(cursor)) {
        orte_node_t *nd = (orte_node_t *) cursor;
        if ('+' == nd->name[0]) {
            orte_show_help("help-hostfile.txt", "hostfile:relative-syntax",
                           true, nd->name);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
    }

    /* drop every node that appears in the exclude list */
    while (NULL != (cursor = opal_list_remove_first(&exclude))) {
        orte_node_t *excluded = (orte_node_t *) cursor;
        /* look for a matching entry in the node list */
        for (probe = opal_list_get_first(nodes);
             probe != opal_list_get_end(nodes);
             probe = opal_list_get_next(probe)) {
            orte_node_t *nd = (orte_node_t *) probe;
            if (0 == strcmp(excluded->name, nd->name)) {
                /* match - remove it from the list and release the ref */
                opal_list_remove_item(nodes, probe);
                OBJ_RELEASE(probe);
                break;
            }
        }
        OBJ_RELEASE(cursor);
    }

    /* indicate that ORTE should override any oversubscribed conditions
     * based on local hardware limits since the user (a) might not have
     * provided us any info on the #slots for a node, and (b) the user
     * might have been wrong! If we don't check the number of local physical
     * processors, then we could be too aggressive on our sched_yield setting
     * and cause performance problems. */
    *override_oversubscribed = true;

cleanup:
    OBJ_DESTRUCT(&exclude);
    return rc;
}
/*
 * Non-blocking send of an iovec array to a peer through the OOB.
 *
 * Validates the tag and peer, then either short-circuits a send-to-self by
 * copying the data and posting both the send-completion event and the
 * matching receive locally, or hands the message to the OOB state machine.
 *
 * Returns ORTE_SUCCESS once the send is queued (completion is reported via
 * cbfunc), or ORTE_ERR_BAD_PARAM for an invalid tag/peer.
 */
int orte_rml_oob_send_nb(struct orte_rml_base_module_t *mod,
                         orte_process_name_t* peer,
                         struct iovec* iov,
                         int count,
                         orte_rml_tag_t tag,
                         orte_rml_callback_fn_t cbfunc,
                         void* cbdata)
{
    orte_rml_recv_t *rcv;
    orte_rml_send_t *snd;
    int bytes;
    orte_self_send_xfer_t *xfer;
    int i;
    char* ptr;

    OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                         "%s rml_send to peer %s at tag %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(peer), tag));

    if (ORTE_RML_TAG_INVALID == tag) {
        /* cannot send to an invalid tag */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }
    if (NULL == peer ||
        OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, ORTE_NAME_INVALID, peer)) {
        /* cannot send to an invalid peer */
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        return ORTE_ERR_BAD_PARAM;
    }

    OPAL_TIMING_EVENT((&tm_rml, "to %s", ORTE_NAME_PRINT(peer)));

    /* if this is a message to myself, then just post the message
     * for receipt - no need to dive into the oob */
    if (OPAL_EQUAL == orte_util_compare_name_fields(ORTE_NS_CMP_ALL, peer, ORTE_PROC_MY_NAME)) {
        /* local delivery */
        OPAL_OUTPUT_VERBOSE((1, orte_rml_base_framework.framework_output,
                             "%s rml_send_iovec_to_self at tag %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tag));
        /* send to self is a tad tricky - we really don't want
         * to track the send callback function throughout the recv
         * process and execute it upon receipt as this would provide
         * very different timing from a non-self message. Specifically,
         * if we just retain a pointer to the incoming data
         * and then execute the send callback prior to the receive,
         * then the caller will think we are done with the data and
         * can release it. So we have to copy the data in order to
         * execute the send callback prior to receiving the message.
         *
         * In truth, this really is a better mimic of the non-self
         * message behavior. If we actually pushed the message out
         * on the wire and had it loop back, then we would receive
         * a new block of data anyway.
         */
        /* setup the send callback */
        xfer = OBJ_NEW(orte_self_send_xfer_t);
        xfer->iov = iov;
        xfer->count = count;
        xfer->cbfunc.iov = cbfunc;
        xfer->tag = tag;
        xfer->cbdata = cbdata;
        /* setup the event for the send callback */
        opal_event_set(orte_event_base, &xfer->ev, -1, OPAL_EV_WRITE, send_self_exe, xfer);
        opal_event_set_priority(&xfer->ev, ORTE_MSG_PRI);
        opal_event_active(&xfer->ev, OPAL_EV_WRITE, 1);

        /* copy the message for the recv */
        rcv = OBJ_NEW(orte_rml_recv_t);
        rcv->sender = *peer;
        rcv->tag = tag;
        /* get the total number of bytes in the iovec array */
        bytes = 0;
        for (i = 0 ; i < count ; ++i) {
            bytes += iov[i].iov_len;
        }
        /* get the required memory allocation */
        if (0 < bytes) {
            rcv->iov.iov_base = (IOVBASE_TYPE*)malloc(bytes);
            rcv->iov.iov_len = bytes;
            /* transfer the bytes into one contiguous buffer */
            ptr = (char*)rcv->iov.iov_base;
            for (i = 0 ; i < count ; ++i) {
                memcpy(ptr, iov[i].iov_base, iov[i].iov_len);
                ptr += iov[i].iov_len;
            }
        }
        /* post the message for receipt - since the send callback was posted
         * first and has the same priority, it will execute first */
        ORTE_RML_ACTIVATE_MESSAGE(rcv);
        return ORTE_SUCCESS;
    }

    /* non-self destination: build a send descriptor and push it into the
     * OOB state machine; the iovec array is NOT copied, so the caller must
     * keep it valid until cbfunc fires */
    snd = OBJ_NEW(orte_rml_send_t);
    snd->dst = *peer;
    snd->origin = *ORTE_PROC_MY_NAME;
    snd->tag = tag;
    snd->iov = iov;
    snd->count = count;
    snd->cbfunc.iov = cbfunc;
    snd->cbdata = cbdata;
    snd->routed = strdup(mod->routed);

    /* activate the OOB send state */
    ORTE_OOB_SEND(snd);

    return ORTE_SUCCESS;
}
/*
 * Request-based RMA get (MPI_Rget) over Portals4.
 *
 * Allocates a request object, validates that both datatypes describe
 * contiguous memory (non-contiguous transfers are not supported by this
 * component), and issues a single PtlGet from the target window into
 * origin_addr. Returns OMPI_SUCCESS, OMPI_ERR_TEMP_OUT_OF_RESOURCE,
 * OMPI_ERR_NOT_SUPPORTED, or the error from datatype/PtlGet calls.
 */
int ompi_osc_portals4_rget(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                           int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                           struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
                           struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;
    ptl_handle_md_t md_h;
    void *md_base;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget: 0x%lx, %d, %s, %d, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    /* hand the request back to the caller before starting the transfer */
    *ompi_req = &request->super;

    /* byte offset into the target window for this target's displacement unit */
    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Rget: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        /* count this outstanding operation; one completion event expected.
         * NOTE(review): the error paths below return without decrementing
         * module->opcount - confirm whether that is intentional */
        (void)opal_atomic_add_64(&module->opcount, 1);
        request->ops_expected = 1;
        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
        length *= origin_count;
        /* find the memory descriptor covering origin_addr so we can express
         * the local address as an offset from the MD base */
        ompi_osc_portals4_get_md(origin_addr, module->req_md_h, &md_h, &md_base);
        ret = PtlGet(md_h,
                     (ptl_size_t) ((char*) origin_addr - (char*) md_base),
                     length,
                     peer,
                     module->pt_idx,
                     module->match_bits,
                     offset,
                     request);
        /* NOTE(review): PtlGet returns PTL_* codes; this compares against
         * OMPI_SUCCESS (both 0 on success) - verify nonzero codes map sanely */
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
    }

    return OMPI_SUCCESS;
}
/* Target EQ */
/*
 * Drain the component's target event queue, completing any non-blocking
 * collective whose PUT event arrives. Returns the number of events handled.
 * Aborts the process on EQ overflow (PTL_EQ_DROPPED) since there is no
 * recovery path for lost flow-control events.
 */
static int portals4_progress(void)
{
    int count = 0, ret;
    ptl_event_t ev;
    ompi_coll_portals4_request_t *ptl_request;

    /* loop until the event queue is empty or an error stops us */
    while (true) {
        ret = PtlEQGet(mca_coll_portals4_component.eq_h, &ev);
        if (PTL_OK == ret) {
            OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
                                 "event type=%s\n", evname[ev.type]));
            count++;

            switch (ev.type) {
            case PTL_EVENT_PUT:
                /* Non-Blocking / request */
                /* NOTE(review): ni_fail_type is compared against PTL_OK here
                 * rather than PTL_NI_OK; both are 0 in common Portals4
                 * implementations, but confirm against the spec in use */
                if (PTL_OK == ev.ni_fail_type) {
                    OPAL_OUTPUT_VERBOSE((50, ompi_coll_base_framework.framework_output,
                                         "hdr_data %p, matchbits 0x%lx",
                                         (void*) ev.hdr_data, ev.match_bits));
                    /* the request pointer was stashed in hdr_data by the sender */
                    assert(0 != ev.hdr_data);
                    ptl_request = (ompi_coll_portals4_request_t*) ev.hdr_data;
                    assert(NULL != ptl_request);

                    /* dispatch to the finalizer for the collective type.
                     * NOTE(review): no default case - an unknown type is
                     * silently ignored here */
                    switch (ptl_request->type) {
                    case OMPI_COLL_PORTALS4_TYPE_BARRIER:
                        ompi_coll_portals4_ibarrier_intra_fini(ptl_request);
                        break;
                    case OMPI_COLL_PORTALS4_TYPE_BCAST:
                        ompi_coll_portals4_ibcast_intra_fini(ptl_request);
                        break;
                    case OMPI_COLL_PORTALS4_TYPE_REDUCE:
                        ompi_coll_portals4_ireduce_intra_fini(ptl_request);
                        break;
                    case OMPI_COLL_PORTALS4_TYPE_ALLREDUCE:
                        ompi_coll_portals4_iallreduce_intra_fini(ptl_request);
                        break;
                    case OMPI_COLL_PORTALS4_TYPE_SCATTER:
                        ompi_coll_portals4_iscatter_intra_fini(ptl_request);
                        break;
                    case OMPI_COLL_PORTALS4_TYPE_GATHER:
                        ompi_coll_portals4_igather_intra_fini(ptl_request);
                        break;
                    }
                }
                /* report delivery failures at low verbosity */
                if (PTL_OK != ev.ni_fail_type) {
                    OPAL_OUTPUT_VERBOSE((10, ompi_coll_base_framework.framework_output,
                                         "ni_fail_type=%s\n",
                                         failtype[ev.ni_fail_type]));
                }
                break;
            default:
                opal_output(ompi_coll_base_framework.framework_output,
                            "Unexpected event of type %d", ev.type);
                break;
            }
        } else if (PTL_EQ_EMPTY == ret) {
            /* no more events pending - done for this pass */
            break;
        } else if (PTL_EQ_DROPPED == ret) {
            /* events were lost; there is no recovery protocol, so abort */
            opal_output(ompi_coll_base_framework.framework_output,
                        "Flow control situation without recovery (EQ_DROPPED)\n");
            abort();
        } else {
            opal_output(ompi_coll_base_framework.framework_output,
                        "Error returned from PtlEQGet: %d", ret);
            break;
        }
    }
    return count;
}
/* ////////////////////////////////////////////////////////////////////////// */
/*
 * Create an mmap-backed shared memory segment described by ds_buf.
 *
 * The backing file is created (optionally relocated to a user-specified
 * directory), sized to hold the caller's payload plus a segment header,
 * mapped, and the header initialized with a lock and the creator's pid.
 * The file descriptor is always closed before returning - the mapping
 * itself keeps the segment alive. Returns OPAL_SUCCESS or an OPAL error.
 */
static int segment_create(opal_shmem_ds_t *ds_buf,
                          const char *file_name,
                          size_t size)
{
    int rc = OPAL_SUCCESS;
    char *real_file_name = NULL;
    pid_t my_pid = getpid();
    bool space_available = false;
    uint64_t amount_space_avail = 0;

    /* the real size of the shared memory segment. this includes enough space
     * to store our segment header. */
    size_t real_size = size + sizeof(opal_shmem_seg_hdr_t);
    opal_shmem_seg_hdr_t *seg_hdrp = MAP_FAILED;

    /* init the contents of opal_shmem_ds_t */
    shmem_ds_reset(ds_buf);

    /* change the path of shmem mmap's backing store? */
    if (0 != opal_shmem_mmap_relocate_backing_file) {
        int err;
        if (path_usable(opal_shmem_mmap_backing_file_base_dir, &err)) {
            if (NULL == (real_file_name =
                         get_uniq_file_name(opal_shmem_mmap_backing_file_base_dir,
                                            file_name))) {
                /* out of resources */
                return OPAL_ERROR;
            }
        }
        /* a relocated backing store was requested, but the path specified
         * cannot be used :-(. if the flag is negative, then warn and continue
         * with the default path. otherwise, fail. */
        else if (opal_shmem_mmap_relocate_backing_file < 0) {
            opal_output(0, "shmem: mmap: WARNING: could not relocate "
                        "backing store to \"%s\" (%s). Continuing with "
                        "default path.\n",
                        opal_shmem_mmap_backing_file_base_dir, strerror(err));
        }
        /* must be positive, so fail */
        else {
            opal_output(0, "shmem: mmap: WARNING: could not relocate "
                        "backing store to \"%s\" (%s). Cannot continue with "
                        "shmem mmap.\n", opal_shmem_mmap_backing_file_base_dir,
                        strerror(err));
            return OPAL_ERROR;
        }
    }

    /* are we using the default path? */
    if (NULL == real_file_name) {
        /* use the path specified by the caller of this function */
        if (NULL == (real_file_name = strdup(file_name))) {
            /* out of resources */
            return OPAL_ERROR;
        }
    }

    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_framework.framework_output,
         "%s: %s: backing store base directory: %s\n",
         mca_shmem_mmap_component.super.base_version.mca_type_name,
         mca_shmem_mmap_component.super.base_version.mca_component_name,
         real_file_name)
    );

    /* determine whether the specified filename is on a network file system.
     * this is an important check because if the backing store is located on
     * a network filesystem, the user may see a shared memory performance hit. */
    if (opal_shmem_mmap_nfs_warning && opal_path_nfs(real_file_name)) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "mmap on nfs", 1, hn,
                       real_file_name);
    }

    /* let's make sure we have enough space for the backing file */
    if (OPAL_SUCCESS != (rc = enough_space(real_file_name,
                                           real_size,
                                           &amount_space_avail,
                                           &space_available))) {
        opal_output(0, "shmem: mmap: an error occurred while determining "
                    "whether or not %s could be created.", real_file_name);
        /* rc is set */
        goto out;
    }
    if (!space_available) {
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        rc = OPAL_ERR_OUT_OF_RESOURCE;
        opal_show_help("help-opal-shmem-mmap.txt", "target full", 1,
                       real_file_name, hn, (unsigned long)real_size,
                       (unsigned long long)amount_space_avail);
        goto out;
    }

    /* enough space is available, so create the segment */
    if (-1 == (ds_buf->seg_id = open(real_file_name, O_CREAT | O_RDWR, 0600))) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "open(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* size backing file - note the use of real_size here */
    if (0 != ftruncate(ds_buf->seg_id, real_size)) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "ftruncate(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }

    if (MAP_FAILED == (seg_hdrp = (opal_shmem_seg_hdr_t *)
                       mmap(NULL, real_size,
                            PROT_READ | PROT_WRITE, MAP_SHARED,
                            ds_buf->seg_id, 0))) {
        int err = errno;
        char hn[MAXHOSTNAMELEN];
        gethostname(hn, MAXHOSTNAMELEN - 1);
        hn[MAXHOSTNAMELEN - 1] = '\0';
        opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                       "mmap(2)", "", strerror(err), err);
        rc = OPAL_ERROR;
        goto out;
    }
    /* all is well */
    else {
        /* -- initialize the shared memory segment -- */
        opal_atomic_rmb();

        /* init segment lock */
        opal_atomic_init(&seg_hdrp->lock, OPAL_ATOMIC_UNLOCKED);
        /* i was the creator of this segment, so note that fact */
        seg_hdrp->cpid = my_pid;

        opal_atomic_wmb();

        /* -- initialize the contents of opal_shmem_ds_t -- */
        ds_buf->seg_cpid = my_pid;
        ds_buf->seg_size = real_size;
        ds_buf->seg_base_addr = (unsigned char *)seg_hdrp;
        /* NOTE(review): strncpy with OPAL_PATH_MAX - 1 does not terminate if
         * the name is that long; appears safe only because shmem_ds_reset
         * above presumably zeroes seg_name - confirm */
        (void)strncpy(ds_buf->seg_name, real_file_name, OPAL_PATH_MAX - 1);

        /* set "valid" bit because segment creation was successful */
        OPAL_SHMEM_DS_SET_VALID(ds_buf);

        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_framework.framework_output,
             "%s: %s: create successful "
             "(id: %d, size: %lu, name: %s)\n",
             mca_shmem_mmap_component.super.base_version.mca_type_name,
             mca_shmem_mmap_component.super.base_version.mca_component_name,
             ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
        );
    }

out:
    /* in this component, the id is the file descriptor returned by open. this
     * check is here to see if it is safe to call close on the file descriptor.
     * that is, we are making sure that our call to open was successful and
     * we are not in an error path. */
    if (-1 != ds_buf->seg_id) {
        if (0 != close(ds_buf->seg_id)) {
            int err = errno;
            char hn[MAXHOSTNAMELEN];
            gethostname(hn, MAXHOSTNAMELEN - 1);
            hn[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1, hn,
                           "close(2)", "", strerror(err), err);
            rc = OPAL_ERROR;
        }
    }

    /* an error occurred, so invalidate the shmem object and munmap if needed */
    if (OPAL_SUCCESS != rc) {
        if (MAP_FAILED != seg_hdrp) {
            munmap((void *)seg_hdrp, real_size);
        }
        shmem_ds_reset(ds_buf);
    }

    /* safe to free now because its contents have already been copied */
    if (NULL != real_file_name) {
        free(real_file_name);
    }

    return rc;
}
int orte_grpcomm_base_comm_start(void) { int rc; OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s grpcomm:base:receive start comm", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); if (!recv_issued) { if (ORTE_PROC_IS_HNP || ORTE_PROC_IS_DAEMON) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE, ORTE_RML_PERSISTENT, daemon_local_recv, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_XCAST, ORTE_RML_PERSISTENT, orte_grpcomm_base_xcast_recv, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON_COLL, ORTE_RML_PERSISTENT, daemon_coll_recv, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } if (ORTE_PROC_IS_DAEMON) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ROLLUP, ORTE_RML_PERSISTENT, orte_grpcomm_base_rollup_recv, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } } if (ORTE_PROC_IS_HNP) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLL_ID_REQ, ORTE_RML_PERSISTENT, coll_id_req, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } } recv_issued = true; } else if (ORTE_PROC_IS_APP) { if (ORTE_SUCCESS != (rc = orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_COLLECTIVE, ORTE_RML_PERSISTENT, app_recv, NULL))) { ORTE_ERROR_LOG(rc); recv_issued = false; return rc; } recv_issued = true; } } return ORTE_SUCCESS; }
/*
 * RDMA get: read `size` bytes from the peer's registered memory (identified
 * by remote_handle->key) into local_address via PtlGet.
 *
 * Reserves an outstanding-operation slot (spinning on component progress
 * until one is free), allocates a user fragment to carry the completion
 * callback, and issues the PtlGet. Completion is signaled through cbfunc
 * when the reply event arrives.
 *
 * Returns OPAL_SUCCESS when the get is posted, OPAL_ERROR otherwise.
 *
 * Fixes vs. previous version:
 *  - removed a stale `if (PTL_OK != ret)` block that read `ret` before any
 *    assignment (leftover from a removed PtlMDBind call - undefined behavior)
 *  - removed the uninitialized `ptl_md_t md` whose fields were passed to the
 *    verbose output; log local_address/size instead
 *  - on PtlGet failure, release the reserved outstanding-op slot instead of
 *    leaking it
 */
int mca_btl_portals4_get(struct mca_btl_base_module_t* btl_base,
                         struct mca_btl_base_endpoint_t* btl_peer,
                         void *local_address,
                         uint64_t remote_address,
                         struct mca_btl_base_registration_handle_t *local_handle,
                         struct mca_btl_base_registration_handle_t *remote_handle,
                         size_t size,
                         int flags,
                         int order,
                         mca_btl_base_rdma_completion_fn_t cbfunc,
                         void *cbcontext,
                         void *cbdata)
{
    mca_btl_portals4_module_t *portals4_btl = (mca_btl_portals4_module_t *) btl_base;
    mca_btl_portals4_frag_t *frag = NULL;
    int ret;

    /* reserve space in the event queue for rdma operations immediately */
    while (OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, 1) >
           portals4_btl->portals_max_outstanding_ops) {
        OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1);
        OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
                             "Call to mca_btl_portals4_component_progress (1)\n"));
        mca_btl_portals4_component_progress();
    }

    /* allocate a fragment to carry the completion callback state */
    OPAL_BTL_PORTALS4_FRAG_ALLOC_USER(portals4_btl, frag);
    if (NULL == frag){
        OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1);
        return OPAL_ERROR;
    }
    OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
        "mca_btl_portals4_get: Incrementing portals_outstanding_ops=%d frag=%p",
        portals4_btl->portals_outstanding_ops, (void *)frag));

    /* stash the caller's completion callback in the fragment */
    frag->rdma_cb.func = cbfunc;
    frag->rdma_cb.context = cbcontext;
    frag->rdma_cb.data = cbdata;
    frag->rdma_cb.local_handle = local_handle;

    frag->endpoint = btl_peer;
    frag->hdr.tag = MCA_BTL_TAG_MAX;

    /* the remote registration key doubles as the Portals match bits */
    frag->match_bits = remote_handle->key;
    frag->addr = local_address;
    frag->length = size;
    frag->peer_proc = btl_peer->ptl_proc;

    OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
        "PtlGet start=%p length=%ld nid=%x pid=%x match_bits=%lx\n",
        local_address, (long) size,
        btl_peer->ptl_proc.phys.nid, btl_peer->ptl_proc.phys.pid,
        frag->match_bits));

    ret = PtlGet(portals4_btl->send_md_h,
                 (ptl_size_t) local_address,
                 size,
                 btl_peer->ptl_proc,
                 portals4_btl->recv_idx,
                 frag->match_bits, /* match bits */
                 0,
                 frag);
    if (OPAL_UNLIKELY(PTL_OK != ret)) {
        opal_output_verbose(1, opal_btl_base_framework.framework_output,
                            "%s:%d: PtlGet failed: %d",
                            __FILE__, __LINE__, ret);
        /* release the reserved outstanding-op slot on failure.
         * NOTE(review): frag itself is not returned to the free list here -
         * consider returning it as well */
        OPAL_THREAD_ADD32(&portals4_btl->portals_outstanding_ops, -1);
        return OPAL_ERROR;
    }
    OPAL_OUTPUT_VERBOSE((90, opal_btl_base_framework.framework_output,
        "SUCCESS: PtlGet start=%p length=%ld nid=%x pid=%x match_bits=%lx\n",
        local_address, (long) size,
        btl_peer->ptl_proc.phys.nid, btl_peer->ptl_proc.phys.pid,
        frag->match_bits));

    return OPAL_SUCCESS;
}
void orte_state_base_track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata; int i; opal_output_verbose(5, orte_state_base_framework.framework_output, "%s state:base:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state)); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; if (jdata->num_launched == jdata->num_procs) { if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_READY_FOR_DEBUGGERS); } else { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_RUNNING); } } } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_REGISTERED); } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* update the proc state */ pdata->state = state; /* Release only the stdin IOF file descriptor for this child, if one * was defined. 
File descriptors for the other IOF channels - stdout, * stderr, and stddiag - were released when their associated pipes * were cleared and closed due to termination of the process */ if (NULL != orte_iof.close) { orte_iof.close(proc, ORTE_IOF_STDIN); } ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* update the proc state */ pdata->state = state; ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* update the proc state */ ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_LOCAL)) { /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); } /* if we are trying to terminate and our routes are * gone, then terminate ourselves IF no local procs * remain (might be some from another job) */ if (orte_orteds_term_ordered && 0 == orte_routed.num_routes()) { for (i=0; i < orte_local_children->size; i++) { if (NULL != (pdata = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i)) && ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_ALIVE)) { /* at least one is still alive */ goto cleanup; } } /* call our appropriate exit procedure */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:base all routes and children gone - exiting", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); ORTE_ACTIVATE_JOB_STATE(NULL, ORTE_JOB_STATE_DAEMONS_TERMINATED); goto cleanup; } /* return the allocated slot for reuse */ cleanup_node(pdata); /* track job status */ jdata->num_terminated++; if (jdata->num_terminated == jdata->num_procs) { ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_TERMINATED); } } cleanup: OBJ_RELEASE(caddy); }
/*
 * State-machine event handler fired when a job may be complete.
 *
 * Marks the job terminated (unless already in an abnormal terminal state),
 * reports non-zero exit codes, releases the job's map resources, then scans
 * all known jobs: if any is still running this just returns; otherwise it
 * orders daemon termination. A NULL/own-jobid caddy means "check whether the
 * daemons themselves are done". The caddy is released on every exit path.
 */
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;
    orte_vpid_t lowest=0;
    int32_t i32, *i32ptr;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:base:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_DAEMONS;
    } else {
        /* mark the job as terminated, but don't override any
         * abnormal termination flags */
        if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
            jdata->state = ORTE_JOB_STATE_TERMINATED;
        }
    }

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    /* if any procs exited non-zero, warn the user (and possibly update the
     * exit status).
     * NOTE(review): `lowest` is always 0 here, so ORTE_UPDATE_EXIT_STATUS
     * is invoked with 0 - confirm this is the intended exit code */
    i32ptr = &i32;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) &&
        !orte_abort_non_zero_exit) {
        if (!orte_report_child_jobs_separately ||
            1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* update the exit code */
            ORTE_UPDATE_EXIT_STATUS(lowest);
        }

        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "While %s job %s terminated normally, %d %s. Further examination may be required.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    i32, (1 == i32) ? "process returned\na non-zero exit code."
                    : "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jdata->state)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here */
    if (NULL != jdata &&
        (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
        goto CHECK_ALIVE;
    }

    /* if the job that is being checked is the HNP, then we are
     * trying to terminate the orteds. In that situation, we
     * do -not- check all jobs - we simply notify the HNP
     * that the orteds are complete. Also check special case
     * if jdata is NULL - we want
     * to definitely declare the job done if the orteds
     * have completed, no matter what else may be happening.
     * This can happen if a ctrl-c hits in the "wrong" place
     * while launching */
 CHECK_DAEMONS:
    if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        if (0 == orte_routed.num_routes()) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job */
    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s releasing procs from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                node->slots_inuse--;
                node->num_procs--;
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* maintain accounting */
            OBJ_RELEASE(node);
            /* flag that the node is no longer in a map.
             * NOTE(review): this touches `node` after OBJ_RELEASE - presumably
             * safe only because the global node pool still holds a reference;
             * confirm the refcount cannot reach zero above */
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

 CHECK_ALIVE:
    /* now check to see if all jobs are done - trigger notification of this jdata
     * object when we find it */
    one_still_alive = false;
    for (j=1; j < orte_job_data->size; j++) {
        if (NULL == (job = (orte_job_t*)opal_pointer_array_get_item(orte_job_data, j))) {
            /* since we are releasing jdata objects as we
             * go, we can no longer assume that the job_data
             * array is left justified */
            continue;
        }
        /* if this is the job we are checking AND it normally terminated,
         * then activate the "notify_completed" state - this will release
         * the job state, but is provided so that the HNP main code can
         * take alternative actions if desired. If the state is killed_by_cmd,
         * then go ahead and release it. We cannot release it if it
         * abnormally terminated as mpirun needs the info so it can
         * report appropriately to the user
         *
         * NOTE: do not release the primary job (j=1) so we
         * can pretty-print completion message
         */
        if (NULL != jdata && job->jobid == jdata->jobid) {
            if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is terminated - activating notify",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
                one_still_alive = true;
            } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
                       jdata->state == ORTE_JOB_STATE_NOTIFIED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is killed or notified - cleaning up",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained! */
                if (1 < j) {
                    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                        /* this was a debugger daemon. notify that a debugger has detached */
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
                    }
                    opal_pointer_array_set_item(orte_job_data, j, NULL);  /* ensure the array has a NULL */
                    OBJ_RELEASE(jdata);
                }
            }
            continue;
        }
        /* if the job is flagged to not be monitored, skip it */
        if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
            continue;
        }
        /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated! */
        if (job->num_terminated < job->num_procs) {
            /* we have at least one job that is not done yet - we cannot
             * just return, though, as we need to ensure we cleanout the
             * job data for the job that just completed */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is not terminated (%d:%d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs));
            one_still_alive = true;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs,
                                 (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state) ));
        }
    }

    /* if a job is still alive, we just return */
    if (one_still_alive) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_completed at least one job is not terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }
    /* if we get here, then all jobs are done, so terminate */
    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed all jobs terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* stop the job timeout event, if set */
    if (NULL != orte_mpiexec_timeout) {
        OBJ_RELEASE(orte_mpiexec_timeout);
        orte_mpiexec_timeout = NULL;
    }

    /* set the exit status to 0 - this will only happen if it
     * wasn't already set by an error condition */
    ORTE_UPDATE_EXIT_STATUS(0);

    /* order daemon termination - this tells us to cleanup
     * our local procs as well as telling remote daemons
     * to die */
    orte_plm.terminate_orteds();

    OBJ_RELEASE(caddy);
}
/*
 * Expand a comma-separated node list that may contain bracketed ranges
 * (e.g. "node[1-3],node7") into individual node names.
 *
 * @param regexp  the node list to expand; may be NULL (treated as empty)
 * @param names   [OUT] argv-style array the expanded names are appended to
 *
 * @return ORTE_SUCCESS on success; ORTE_ERR_BAD_PARAM on malformed input
 *         (leading special char, unterminated range); ORTE_ERR_OUT_OF_RESOURCE
 *         if the working copy cannot be allocated.
 *
 * NOTE(review): an empty (but non-NULL) regexp trips the i==0 "special char"
 * error path — presumably intentional input validation; confirm with callers.
 */
int orte_regex_extract_node_names(char *regexp, char ***names)
{
    int i, j, len, ret;
    char *base;
    char *orig;
    bool found_range = false;
    bool more_to_come = false;

    if (NULL == regexp) {
        *names = NULL;
        return ORTE_SUCCESS;
    }

    /* work on a private copy - the parse below writes '\0' into it */
    orig = base = strdup(regexp);
    if (NULL == base) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s regex:extract:nodenames: checking nodelist: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         regexp));

    do {
        /* Find the base - scan up to and including the terminating NUL
         * so a trailing singleton is detected */
        len = strlen(base);
        for (i = 0; i <= len; ++i) {
            if (base[i] == '[') {
                /* we found a range. this gets dealt with below */
                base[i] = '\0';
                found_range = true;
                break;
            }
            if (base[i] == ',') {
                /* we found a singleton node, and there are more to come */
                base[i] = '\0';
                found_range = false;
                more_to_come = true;
                break;
            }
            if (base[i] == '\0') {
                /* we found a singleton node */
                found_range = false;
                more_to_come = false;
                break;
            }
        }
        if(i == 0) {
            /* we found a special character at the beginning of the string */
            orte_show_help("help-regex.txt", "regex:special-char", true, regexp);
            free(orig);
            return ORTE_ERR_BAD_PARAM;
        }

        if (found_range) {
            /* If we found a range, now find the end of the range */
            for (j = i; j < len; ++j) {
                if (base[j] == ']') {
                    base[j] = '\0';
                    break;
                }
            }
            if (j >= len) {
                /* we didn't find the end of the range */
                orte_show_help("help-regex.txt", "regex:end-range-missing", true, regexp);
                free(orig);
                return ORTE_ERR_BAD_PARAM;
            }

            /* base is the prefix, base+i+1 is the range spec ("1-3,5"...) */
            ret = regex_parse_node_ranges(base, base + i + 1, names);
            if(ORTE_SUCCESS != ret) {
                orte_show_help("help-regex.txt", "regex:bad-value", true, regexp);
                free(orig);
                return ret;
            }
            if(base[j + 1] == ',') {
                /* more entries follow the closing bracket */
                more_to_come = true;
                base = &base[j + 2];
            } else {
                more_to_come = false;
            }
        } else {
            /* If we didn't find a range, just add the node */
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s regex:extract:nodenames: found node: %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), base));
            if(ORTE_SUCCESS != (ret = opal_argv_append_nosize(names, base))) {
                ORTE_ERROR_LOG(ret);
                free(orig);
                return ret;
            }
            /* set base equal to the (possible) next base to look at */
            base = &base[i + 1];
        }
    } while(more_to_come);

    free(orig);

    /* All done */
    return ret;
}
/*
 * Activate a job state: locate the registered handler for the given state
 * and fire it as a libevent callback at the handler's priority. If no
 * exact handler is registered, fall back to the ERROR handler (for states
 * beyond ORTE_JOB_STATE_ERROR) or the ANY handler.
 */
void orte_state_base_activate_job_state(orte_job_t *jdata, orte_job_state_t state)
{
    opal_list_item_t *entry;
    opal_list_item_t *any_match = NULL;
    opal_list_item_t *err_match = NULL;
    orte_state_t *st;
    orte_state_caddy_t *cd;

    for (entry = opal_list_get_first(&orte_job_states);
         entry != opal_list_get_end(&orte_job_states);
         entry = opal_list_get_next(entry)) {
        st = (orte_state_t*)entry;
        /* exact match wins immediately - activate and return */
        if (st->job_state == state) {
            OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                 "%s ACTIVATING JOB %s STATE %s PRI %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                                 orte_job_state_to_str(state), st->priority));
            /* a NULL callback means "deliberately ignore this state" */
            if (NULL == st->cbfunc) {
                OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                                     "%s NULL CBFUNC FOR JOB %s STATE %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     (NULL == jdata) ? "ALL" : ORTE_JOBID_PRINT(jdata->jobid),
                                     orte_job_state_to_str(state)));
                return;
            }
            cd = OBJ_NEW(orte_state_caddy_t);
            if (NULL != jdata) {
                cd->jdata = jdata;
                cd->job_state = state;
                /* caddy holds a reference to the job until the callback fires */
                OBJ_RETAIN(jdata);
            }
            opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, st->cbfunc, cd);
            opal_event_set_priority(&cd->ev, st->priority);
            opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
            return;
        }
        /* remember the fallback handlers in case no exact match exists */
        if (st->job_state == ORTE_JOB_STATE_ANY) {
            any_match = entry;
        }
        if (st->job_state == ORTE_JOB_STATE_ERROR) {
            err_match = entry;
        }
    }

    /* no exact handler registered - pick a default: error states route to
     * the ERROR handler, everything else to the ANY handler */
    if (ORTE_JOB_STATE_ERROR < state && NULL != err_match) {
        st = (orte_state_t*)err_match;
    } else if (NULL != any_match) {
        st = (orte_state_t*)any_match;
    } else {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE NOT FOUND"));
        return;
    }
    if (NULL == st->cbfunc) {
        OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                             "ACTIVATE: ANY STATE HANDLER NOT DEFINED"));
        return;
    }
    cd = OBJ_NEW(orte_state_caddy_t);
    if (NULL != jdata) {
        cd->jdata = jdata;
        cd->job_state = state;
        OBJ_RETAIN(jdata);
    }
    OPAL_OUTPUT_VERBOSE((1, orte_state_base_framework.framework_output,
                         "%s ACTIVATING JOB %s STATE %s PRI %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(state), st->priority));
    opal_event_set(orte_event_base, &cd->ev, -1, OPAL_EV_WRITE, st->cbfunc, cd);
    opal_event_set_priority(&cd->ev, st->priority);
    opal_event_active(&cd->ev, OPAL_EV_WRITE, 1);
}
/*
 * Sequentially map the ranks according to the placement in the
 * specified hostfile.
 *
 * Fixes in this revision:
 *  - free the line returned by orte_getline (it was leaked for every
 *    line read from the default hostfile, including blank lines);
 *  - route all early error exits through the error: label so
 *    default_seq_list (and any per-app sq_list) is destructed instead
 *    of leaked by a bare "return ORTE_ERR_SILENT";
 *  - free "hosts" / destruct node_list on the hostfile/dash-host
 *    failure paths;
 *  - the node-pool scan no longer leaves "node" pointing at the last
 *    pool entry when no match exists, so the "resource-not-found"
 *    check actually fires instead of silently mapping to the wrong node.
 */
static int orte_rmaps_seq_map(orte_job_t *jdata)
{
    orte_job_map_t *map;
    orte_app_context_t *app;
    int i, n;
    orte_std_cntr_t j;
    opal_list_item_t *item;
    orte_node_t *node, *nd;
    seq_node_t *sq, *save=NULL, *seq;
    orte_vpid_t vpid;
    orte_std_cntr_t num_nodes;
    int rc;
    opal_list_t default_seq_list;
    opal_list_t node_list, *seq_list, sq_list;
    orte_proc_t *proc;
    mca_base_component_t *c = &mca_rmaps_seq_component.base_version;
    char *hosts, *hstname, *sep, *eptr;
    FILE *fp;
#if OPAL_HAVE_HWLOC
    opal_hwloc_resource_type_t rtype;
#endif

    OPAL_OUTPUT_VERBOSE((1, orte_rmaps_base_framework.framework_output,
                         "%s rmaps:seq called on job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));

    /* this mapper can only handle initial launch
     * when seq mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s is being restarted - seq cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper) {
        if (0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
            /* a mapper has been specified, and it isn't me */
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: job %s not using sequential mapper",
                                ORTE_JOBID_PRINT(jdata->jobid));
            return ORTE_ERR_TAKE_NEXT_OPTION;
        }
        /* we need to process it */
        goto process;
    }
    if (ORTE_MAPPING_SEQ != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: job %s not using seq mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

 process:
    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:seq: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* convenience def */
    map = jdata->map;

    /* if there is a default hostfile, go and get its ordered list of nodes */
    OBJ_CONSTRUCT(&default_seq_list, opal_list_t);
    if (NULL != orte_default_hostfile) {
        /* open the file */
        fp = fopen(orte_default_hostfile, "r");
        if (NULL == fp) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            rc = ORTE_ERR_NOT_FOUND;
            goto error;
        }
        while (NULL != (hstname = orte_getline(fp))) {
            if (0 == strlen(hstname)) {
                /* blank line - ignore (and free the line, it was malloc'd) */
                free(hstname);
                continue;
            }
            sq = OBJ_NEW(seq_node_t);
            if (NULL != (sep = strchr(hstname, ' '))) {
                /* anything after the first blank is a cpuset spec */
                *sep = '\0';
                sep++;
                /* remove any trailing space */
                eptr = sep + strlen(sep) - 1;
                while (eptr > sep && isspace(*eptr)) {
                    eptr--;
                }
                *(eptr+1) = 0;
                sq->cpuset = strdup(sep);
            }
            sq->hostname = strdup(hstname);
            opal_list_append(&default_seq_list, &sq->super);
            /* the line itself is no longer needed */
            free(hstname);
        }
        fclose(fp);
    }

    /* start at the beginning... */
    vpid = 0;
    jdata->num_procs = 0;
    if (0 < opal_list_get_size(&default_seq_list)) {
        save = (seq_node_t*)opal_list_get_first(&default_seq_list);
    }

#if OPAL_HAVE_HWLOC
    /* default to LOGICAL processors */
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_PHYSICAL_CPUIDS, NULL, OPAL_BOOL)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using PHYSICAL processors");
        rtype = OPAL_HWLOC_PHYSICAL;
    } else {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:seq: using LOGICAL processors");
        rtype = OPAL_HWLOC_LOGICAL;
    }
#endif

    /* initialize all the nodes as not included in this job map */
    for (j=0; j < orte_node_pool->size; j++) {
        if (NULL != (node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
        }
    }

    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }

        /* dash-host trumps hostfile */
        if (orte_get_attribute(&app->attributes, ORTE_APP_DASH_HOST, (void**)&hosts, OPAL_STRING)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using dash-host nodes on app %s", app->app);
            OBJ_CONSTRUCT(&node_list, opal_list_t);
            /* dash host entries cannot specify cpusets, so use the std
             * function to retrieve the list */
            if (ORTE_SUCCESS != (rc = orte_util_get_ordered_dash_host_list(&node_list, hosts))) {
                ORTE_ERROR_LOG(rc);
                free(hosts);
                OBJ_DESTRUCT(&node_list);
                goto error;
            }
            free(hosts);
            /* transfer the list to a seq_node_t list */
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            while (NULL != (nd = (orte_node_t*)opal_list_remove_first(&node_list))) {
                sq = OBJ_NEW(seq_node_t);
                sq->hostname = strdup(nd->name);
                opal_list_append(&sq_list, &sq->super);
                OBJ_RELEASE(nd);
            }
            OBJ_DESTRUCT(&node_list);
            seq_list = &sq_list;
        } else if (orte_get_attribute(&app->attributes, ORTE_APP_HOSTFILE, (void**)&hosts, OPAL_STRING)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using hostfile %s nodes on app %s", hosts, app->app);
            OBJ_CONSTRUCT(&sq_list, opal_list_t);
            /* open the file */
            fp = fopen(hosts, "r");
            if (NULL == fp) {
                ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
                rc = ORTE_ERR_NOT_FOUND;
                free(hosts);
                OBJ_DESTRUCT(&sq_list);
                goto error;
            }
            while (NULL != (hstname = orte_getline(fp))) {
                if (0 == strlen(hstname)) {
                    /* blank line - ignore (matches default-hostfile handling) */
                    free(hstname);
                    continue;
                }
                sq = OBJ_NEW(seq_node_t);
                if (NULL != (sep = strchr(hstname, ' '))) {
                    *sep = '\0';
                    sep++;
                    /* remove any trailing space */
                    eptr = sep + strlen(sep) - 1;
                    while (eptr > sep && isspace(*eptr)) {
                        eptr--;
                    }
                    *(eptr+1) = 0;
                    sq->cpuset = strdup(sep);
                }
                /* ownership of the line transfers to the seq_node_t */
                sq->hostname = hstname;
                opal_list_append(&sq_list, &sq->super);
            }
            fclose(fp);
            free(hosts);
            seq_list = &sq_list;
        } else if (0 < opal_list_get_size(&default_seq_list)) {
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: using default hostfile nodes on app %s", app->app);
            seq_list = &default_seq_list;
        } else {
            /* can't do anything - no nodes available! */
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* check for nolocal and remove the head node, if required */
        if (map->mapping & ORTE_MAPPING_NO_USE_LOCAL) {
            for (item = opal_list_get_first(seq_list);
                 item != opal_list_get_end(seq_list);
                 item = opal_list_get_next(item) ) {
                seq = (seq_node_t*)item;
                /* need to check ifislocal because the name in the
                 * hostfile may not have been FQDN, while name returned
                 * by gethostname may have been (or vice versa)
                 */
                if (opal_ifislocal(seq->hostname)) {
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:seq: removing head node %s", seq->hostname);
                    opal_list_remove_item(seq_list, item);
                    OBJ_RELEASE(item);  /* "un-retain" it */
                }
            }
        }

        if (NULL == seq_list || 0 == (num_nodes = (orte_std_cntr_t)opal_list_get_size(seq_list))) {
            orte_show_help("help-orte-rmaps-base.txt",
                           "orte-rmaps-base:no-available-resources",
                           true);
            if (NULL != seq_list && seq_list != &default_seq_list) {
                OPAL_LIST_DESTRUCT(seq_list);
            }
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* if num_procs wasn't specified, set it now */
        if (0 == app->num_procs) {
            app->num_procs = num_nodes;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: setting num procs to %s for app %s",
                                ORTE_VPID_PRINT(app->num_procs), app->app);
        } else if (num_nodes < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "seq:not-enough-resources", true,
                           app->num_procs, num_nodes);
            if (seq_list != &default_seq_list) {
                OPAL_LIST_DESTRUCT(seq_list);
            }
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        if (seq_list == &default_seq_list) {
            /* NOTE(review): when several apps share the default list, we
             * resume at "save", but num_nodes above counted the whole list -
             * presumably callers never over-ask; verify */
            sq = save;
        } else {
            sq = (seq_node_t*)opal_list_get_first(seq_list);
        }
        for (n=0; n < app->num_procs; n++) {
            /* find this node on the global array - this is necessary so
             * that our mapping gets saved on that array as the objects
             * returned by the hostfile function are -not- on the array
             */
            node = NULL;
            for (j=0; j < orte_node_pool->size; j++) {
                if (NULL == (nd = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, j))) {
                    continue;
                }
                if (0 == strcmp(sq->hostname, nd->name)) {
                    /* only record a match - leave node NULL otherwise so the
                     * not-found check below is reliable */
                    node = nd;
                    break;
                }
            }
            if (NULL == node) {
                /* wasn't found - that is an error */
                orte_show_help("help-orte-rmaps-seq.txt",
                               "orte-rmaps-seq:resource-not-found",
                               true, sq->hostname);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* ensure the node is in the map */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                OBJ_RETAIN(node);
                opal_pointer_array_add(map->nodes, node);
                jdata->map->num_nodes++;
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
            }
            /* NOTE(review): setup_proc return is not checked for NULL here -
             * confirm it cannot fail, or add a check */
            proc = orte_rmaps_base_setup_proc(jdata, node, i);
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
            }
            /* assign the vpid */
            proc->name.vpid = vpid++;
            opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                "mca:rmaps:seq: assign proc %s to node %s for app %s",
                                ORTE_VPID_PRINT(proc->name.vpid), sq->hostname, app->app);

#if OPAL_HAVE_HWLOC
            {
                /* record the cpuset, if given */
                if (NULL != sq->cpuset) {
                    hwloc_cpuset_t bitmap;
                    char *cpu_bitmap;
                    if (NULL == node->topology) {
                        /* not allowed - for sequential cpusets, we must have
                         * the topology info
                         */
                        orte_show_help("help-orte-rmaps-base.txt", "rmaps:no-topology", true, node->name);
                        rc = ORTE_ERR_SILENT;
                        goto error;
                    }
                    /* if we are using hwthreads as cpus and binding to hwthreads, then
                     * we can just copy the cpuset across as it already specifies things
                     * at that level
                     */
                    if (opal_hwloc_use_hwthreads_as_cpus &&
                        OPAL_BIND_TO_HWTHREAD == OPAL_GET_BINDING_POLICY(opal_hwloc_binding_policy)) {
                        cpu_bitmap = strdup(sq->cpuset);
                    } else {
                        /* setup the bitmap */
                        bitmap = hwloc_bitmap_alloc();
                        /* parse the slot_list to find the socket and core */
                        if (ORTE_SUCCESS != (rc = opal_hwloc_base_slot_list_parse(sq->cpuset, node->topology, rtype, bitmap))) {
                            ORTE_ERROR_LOG(rc);
                            hwloc_bitmap_free(bitmap);
                            goto error;
                        }
                        /* note that we cannot set the proc locale to any specific object
                         * as the slot list may have assigned it to more than one - so
                         * leave that field NULL
                         */
                        /* set the proc to the specified map */
                        hwloc_bitmap_list_asprintf(&cpu_bitmap, bitmap);
                        hwloc_bitmap_free(bitmap);
                    }
                    orte_set_attribute(&proc->attributes, ORTE_PROC_CPU_BITMAP, ORTE_ATTR_GLOBAL, cpu_bitmap, OPAL_STRING);
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:seq: binding proc %s to cpuset %s bitmap %s",
                                        ORTE_VPID_PRINT(proc->name.vpid), sq->cpuset, cpu_bitmap);
                    /* we are going to bind to cpuset since the user is specifying the cpus */
                    OPAL_SET_BINDING_POLICY(jdata->map->binding, OPAL_BIND_TO_CPUSET);
                    /* note that the user specified the mapping */
                    ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYUSER);
                    ORTE_SET_MAPPING_DIRECTIVE(jdata->map->mapping, ORTE_MAPPING_GIVEN);
                    /* cleanup */
                    free(cpu_bitmap);
                } else {
                    hwloc_obj_t locale;
                    /* assign the locale - okay for the topo to be null as
                     * it just means it wasn't returned
                     */
                    if (NULL != node->topology) {
                        locale = hwloc_get_root_obj(node->topology);
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE,
                                           ORTE_ATTR_LOCAL, locale, OPAL_PTR);
                    }
                }
            }
#endif

            /* add to the jdata proc array */
            if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                ORTE_ERROR_LOG(rc);
                goto error;
            }
            /* move to next node */
            sq = (seq_node_t*)opal_list_get_next(&sq->super);
        }

        /** track the total number of processes we mapped */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list if it came from this app_context */
        if (seq_list != &default_seq_list) {
            OPAL_LIST_DESTRUCT(seq_list);
        } else {
            save = sq;
        }
    }

    return ORTE_SUCCESS;

 error:
    OPAL_LIST_DESTRUCT(&default_seq_list);
    return rc;
}
/*
 * Build an ordered list of hosts from the given hostfile, preserving
 * duplicates and expanding relative-node directives:
 *   "+e[:N]"  -> substitute N (or all) currently-empty nodes from the pool
 *   "+nIDX"   -> substitute the node at pool index IDX
 * Nodes named in the hostfile's exclude set are then removed (every
 * occurrence, since duplicates are kept).
 *
 * @param nodes    [IN/OUT] list receiving orte_node_t entries, in file order
 * @param hostfile path of the hostfile to parse
 * @return ORTE_SUCCESS or an ORTE error code (ORTE_ERR_SILENT after
 *         orte_show_help was already displayed)
 *
 * Fix in this revision: the exclude-removal loop used to OBJ_RELEASE an
 * item and then call opal_list_get_next() on that released item - a
 * use-after-free. We now save the successor before removing/releasing.
 */
int orte_util_get_ordered_host_list(opal_list_t *nodes,
                                    char *hostfile)
{
    opal_list_t exclude;
    opal_list_item_t *item, *itm, *item2, *item1;
    char *cptr;
    int num_empty, i, nodeidx, startempty=0;
    bool want_all_empty=false;
    orte_node_t *node_from_pool, *newnode;
    int rc;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s hostfile: creating ordered list of hosts from hostfile %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));

    OBJ_CONSTRUCT(&exclude, opal_list_t);

    /* parse the hostfile and add the contents to the list, keeping duplicates */
    if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, nodes, &exclude, true))) {
        goto cleanup;
    }

    /* parse the nodes to process any relative node directives */
    item2 = opal_list_get_first(nodes);
    while (item2 != opal_list_get_end(nodes)) {
        orte_node_t *node=(orte_node_t*)item2;

        /* save the next location in case this one gets removed */
        item1 = opal_list_get_next(item2);

        if ('+' != node->name[0]) {
            item2 = item1;
            continue;
        }

        /* see if we specified empty nodes */
        if ('e' == node->name[1] ||
            'E' == node->name[1]) {
            /* request for empty nodes - do they want
             * all of them?
             */
            if (NULL != (cptr = strchr(node->name, ':'))) {
                /* the colon indicates a specific # are requested */
                cptr++; /* step past : */
                num_empty = strtol(cptr, NULL, 10);
            } else {
                /* want them all - set num_empty to max */
                num_empty = INT_MAX;
                want_all_empty = true;
            }
            /* insert empty nodes into newnodes list in place of the current item.
             * since item1 is the next item, we insert in front of it
             */
            if (!orte_hnp_is_allocated && 0 == startempty) {
                /* skip the HNP's own entry when it isn't part of the allocation */
                startempty = 1;
            }
            for (i=startempty; 0 < num_empty && i < orte_node_pool->size; i++) {
                if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, i))) {
                    continue;
                }
                if (0 == node_from_pool->slots_inuse) {
                    newnode = OBJ_NEW(orte_node_t);
                    newnode->name = strdup(node_from_pool->name);
                    /* if the slot count here is less than the
                     * total slots avail on this node, set it
                     * to the specified count - this allows people
                     * to subdivide an allocation
                     */
                    if (node->slots < node_from_pool->slots) {
                        newnode->slots_alloc = node->slots;
                    } else {
                        newnode->slots_alloc = node_from_pool->slots;
                    }
                    opal_list_insert_pos(nodes, item1, &newnode->super);
                    /* track number added */
                    --num_empty;
                }
            }
            /* bookmark where we stopped in case they ask for more */
            startempty = i;
            /* did they get everything they wanted? */
            if (!want_all_empty && 0 < num_empty) {
                orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
                               true, num_empty);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            /* since we have expanded the provided node, remove
             * it from list
             */
            opal_list_remove_item(nodes, item2);
            OBJ_RELEASE(item2);
        } else if ('n' == node->name[1] ||
                   'N' == node->name[1]) {
            /* they want a specific relative node #, so
             * look it up on global pool
             */
            nodeidx = strtol(&node->name[2], NULL, 10);
            /* if the HNP is not allocated, then we need to
             * adjust the index as the node pool is offset
             * by one
             */
            if (!orte_hnp_is_allocated) {
                nodeidx++;
            }
            /* see if that location is filled */
            if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
                /* this is an error */
                orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
                               true, nodeidx, node->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
            /* create the node object */
            newnode = OBJ_NEW(orte_node_t);
            newnode->name = strdup(node_from_pool->name);
            /* if the slot count here is less than the
             * total slots avail on this node, set it
             * to the specified count - this allows people
             * to subdivide an allocation
             */
            if (node->slots < node_from_pool->slots) {
                newnode->slots_alloc = node->slots;
            } else {
                newnode->slots_alloc = node_from_pool->slots;
            }
            /* insert it before item1 */
            opal_list_insert_pos(nodes, item1, &newnode->super);
            /* since we have expanded the provided node, remove
             * it from list
             */
            opal_list_remove_item(nodes, item2);
            OBJ_RELEASE(item2);
        } else {
            /* invalid relative node syntax */
            orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
                           true, node->name);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }

        /* move to next */
        item2 = item1;
    }

    /* remove from the list of nodes those that are in the exclude list */
    while(NULL != (item = opal_list_remove_first(&exclude))) {
        orte_node_t *exnode = (orte_node_t*)item;
        /* check for matches on nodes - have to cycle through the entire
         * list as we could have duplicates. Save the successor BEFORE
         * removing/releasing the current item so we never walk through
         * freed memory */
        itm = opal_list_get_first(nodes);
        while (itm != opal_list_get_end(nodes)) {
            opal_list_item_t *nxt = opal_list_get_next(itm);
            orte_node_t *node=(orte_node_t*)itm;
            if (0 == strcmp(exnode->name, node->name)) {
                /* match - remove it */
                opal_list_remove_item(nodes, itm);
                OBJ_RELEASE(itm);
            }
            itm = nxt;
        }
        OBJ_RELEASE(item);
    }

cleanup:
    OBJ_DESTRUCT(&exclude);
    return rc;
}
/* Parse the provided hostfile and filter the nodes that are
 * on the input list, removing those that
 * are not found in the hostfile.
 *
 * Fixes in this revision:
 *  - the "+nIDX" relative-node search advanced the iterator with
 *    opal_list_get_next(nodes) instead of opal_list_get_next(item1),
 *    which never walks the list (it keeps returning the first item);
 *  - removed the dangling "next = opal_list_get_next(item2)" computed on
 *    an item already unlinked by opal_list_remove_first (dead store on a
 *    detached item);
 *  - the exclude list is now OBJ_DESTRUCTed on every path (it was never
 *    destructed, including the parse-failure early return);
 *  - the keep list is drained and destructed on the error paths instead
 *    of leaking the nodes moved onto it.
 */
int orte_util_filter_hostfile_nodes(opal_list_t *nodes, char *hostfile)
{
    opal_list_t newnodes, exclude;
    opal_list_item_t *item1, *item2, *next, *item3;
    orte_node_t *node_from_list, *node_from_file, *node_from_pool, *node3;
    int rc = ORTE_SUCCESS;
    char *cptr;
    int num_empty, nodeidx;
    bool want_all_empty = false;
    opal_list_t keep;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s hostfile: filtering nodes through hostfile %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostfile));

    /* parse the hostfile and create local list of findings */
    OBJ_CONSTRUCT(&newnodes, opal_list_t);
    OBJ_CONSTRUCT(&exclude, opal_list_t);
    if (ORTE_SUCCESS != (rc = hostfile_parse(hostfile, &newnodes, &exclude, false))) {
        OBJ_DESTRUCT(&newnodes);
        OBJ_DESTRUCT(&exclude);
        return rc;
    }

    /* remove from the list of newnodes those that are in the exclude list
     * since we could have added duplicate names above due to the
     */
    while (NULL != (item1 = opal_list_remove_first(&exclude))) {
        node_from_file = (orte_node_t*)item1;
        /* check for matches on nodes */
        for (item2 = opal_list_get_first(&newnodes);
             item2 != opal_list_get_end(&newnodes);
             item2 = opal_list_get_next(item2)) {
            orte_node_t *node = (orte_node_t*)item2;
            if (0 == strcmp(node_from_file->name, node->name)) {
                /* match - remove it */
                opal_list_remove_item(&newnodes, item2);
                OBJ_RELEASE(item2);
                break;
            }
        }
        OBJ_RELEASE(item1);
    }
    /* the exclude list is now empty and no longer needed */
    OBJ_DESTRUCT(&exclude);

    /* now check our nodes and keep those that match. We can
     * destruct our hostfile list as we go since this won't be needed
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    while (NULL != (item2 = opal_list_remove_first(&newnodes))) {
        node_from_file = (orte_node_t*)item2;

        /* see if this is a relative node syntax */
        if ('+' == node_from_file->name[0]) {
            /* see if we specified empty nodes */
            if ('e' == node_from_file->name[1] ||
                'E' == node_from_file->name[1]) {
                /* request for empty nodes - do they want
                 * all of them?
                 */
                if (NULL != (cptr = strchr(node_from_file->name, ':'))) {
                    /* the colon indicates a specific # are requested */
                    cptr++; /* step past : */
                    num_empty = strtol(cptr, NULL, 10);
                } else {
                    /* want them all - set num_empty to max */
                    num_empty = INT_MAX;
                    want_all_empty = true;
                }
                /* search the list of nodes provided to us and find those
                 * that are empty
                 */
                item1 = opal_list_get_first(nodes);
                while (0 < num_empty && item1 != opal_list_get_end(nodes)) {
                    node_from_list = (orte_node_t*)item1;
                    next = opal_list_get_next(item1);  /* keep our place */
                    if (0 == node_from_list->slots_inuse) {
                        /* check to see if this node is explicitly called
                         * out later - if so, don't use it here
                         */
                        for (item3 = opal_list_get_first(&newnodes);
                             item3 != opal_list_get_end(&newnodes);
                             item3 = opal_list_get_next(item3)) {
                            node3 = (orte_node_t*)item3;
                            if (0 == strcmp(node3->name, node_from_list->name)) {
                                /* match - don't use it */
                                goto skipnode;
                            }
                        }
                        /* remove item from list */
                        opal_list_remove_item(nodes, item1);
                        /* xfer to keep list */
                        opal_list_append(&keep, item1);
                        --num_empty;
                    }
                skipnode:
                    item1 = next;
                }
                /* did they get everything they wanted? */
                if (!want_all_empty && 0 < num_empty) {
                    orte_show_help("help-hostfile.txt", "hostfile:not-enough-empty",
                                   true, num_empty);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
            } else if ('n' == node_from_file->name[1] ||
                       'N' == node_from_file->name[1]) {
                /* they want a specific relative node #, so
                 * look it up on global pool
                 *
                 * NOTE(review): unlike orte_util_get_ordered_host_list, this
                 * path does not offset nodeidx when the HNP is not allocated -
                 * confirm whether that asymmetry is intentional
                 */
                nodeidx = strtol(&node_from_file->name[2], NULL, 10);
                if (NULL == (node_from_pool = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, nodeidx))) {
                    /* this is an error */
                    orte_show_help("help-hostfile.txt", "hostfile:relative-node-not-found",
                                   true, nodeidx, node_from_file->name);
                    rc = ORTE_ERR_SILENT;
                    goto cleanup;
                }
                /* search the list of nodes provided to us and find it */
                for (item1 = opal_list_get_first(nodes);
                     item1 != opal_list_get_end(nodes);
                     item1 = opal_list_get_next(item1)) {
                    node_from_list = (orte_node_t*)item1;
                    if (0 == strcmp(node_from_list->name, node_from_pool->name)) {
                        /* match - remove item from list */
                        opal_list_remove_item(nodes, item1);
                        /* xfer to keep list */
                        opal_list_append(&keep, item1);
                        break;
                    }
                }
            } else {
                /* invalid relative node syntax */
                orte_show_help("help-hostfile.txt", "hostfile:invalid-relative-node-syntax",
                               true, node_from_file->name);
                rc = ORTE_ERR_SILENT;
                goto cleanup;
            }
        } else {
            /* we are looking for a specific node on the list
             * search the provided list of nodes to see if this
             * one is found
             */
            for (item1 = opal_list_get_first(nodes);
                 item1 != opal_list_get_end(nodes);
                 item1 = opal_list_get_next(item1)) {
                node_from_list = (orte_node_t*)item1;
                /* since the name in the hostfile might not match
                 * our local name, and yet still be intended to match,
                 * we have to check for local interfaces
                 */
                if (0 == strcmp(node_from_file->name, node_from_list->name) ||
                    (opal_ifislocal(node_from_list->name) &&
                     opal_ifislocal(node_from_file->name))) {
                    /* if the slot count here is less than the
                     * total slots avail on this node, set it
                     * to the specified count - this allows people
                     * to subdivide an allocation
                     */
                    if (node_from_file->slots < node_from_list->slots) {
                        node_from_list->slots_alloc = node_from_file->slots;
                    }
                    /* remove the node from the list */
                    opal_list_remove_item(nodes, item1);
                    /* xfer it to keep list */
                    opal_list_append(&keep, item1);
                    break;
                }
            }
        }
        /* cleanup the newnode list */
        OBJ_RELEASE(item2);
    }

    /* if we still have entries on our hostfile list, then
     * there were requested hosts that were not in our allocation.
     * This is an error - report it to the user and return an error
     */
    if (0 != opal_list_get_size(&newnodes)) {
        orte_show_help("help-hostfile.txt", "not-all-mapped-alloc",
                       true, hostfile);
        while (NULL != (item1 = opal_list_remove_first(&newnodes))) {
            OBJ_RELEASE(item1);
        }
        rc = ORTE_ERR_SILENT;
        goto cleanup;
    }

    /* clear the rest of the nodes list */
    while (NULL != (item1 = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item1);
    }

    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item1 = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item1);
    }

cleanup:
    /* on error paths the keep list may still hold nodes - release them
     * (they were already removed from the caller's list) */
    while (NULL != (item1 = opal_list_remove_first(&keep))) {
        OBJ_RELEASE(item1);
    }
    OBJ_DESTRUCT(&keep);
    OBJ_DESTRUCT(&newnodes);
    return rc;
}
void orte_grpcomm_base_progress_collectives(void) { opal_list_item_t *item; orte_grpcomm_collective_t *coll; orte_namelist_t *nm; orte_job_t *jdata; opal_buffer_t *relay; int rc; /* cycle thru all known collectives - any collective on the list * must have come from either a local proc or receiving a global * collective. Either way, the number of required recipients * is the number of local procs for that job */ item = opal_list_get_first(&orte_grpcomm_base.active_colls); while (item != opal_list_get_end(&orte_grpcomm_base.active_colls)) { coll = (orte_grpcomm_collective_t*)item; OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s PROGRESSING COLL id %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id)); /* if this collective is already locally complete, then ignore it */ if (coll->locally_complete) { OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s COLL %d IS LOCALLY COMPLETE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id)); goto next_coll; } /* get the jobid of the participants in this collective */ if (NULL == (nm = (orte_namelist_t*)opal_list_get_first(&coll->participants))) { opal_output(0, "NO PARTICIPANTS"); goto next_coll; } /* get the job object for this participant */ if (NULL == (jdata = orte_get_job_data_object(nm->name.jobid))) { /* if the job object isn't found, then we can't progress * this collective */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s COLL %d JOBID %s NOT FOUND", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id, ORTE_JOBID_PRINT(nm->name.jobid))); goto next_coll; } /* all local procs from this job are required to participate */ OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output, "%s ALL LOCAL PROCS FOR JOB %s CONTRIBUTE %d", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid), (int)jdata->num_local_procs)); /* see if all reqd participants are done */ if (jdata->num_local_procs == coll->num_local_recvd) { OPAL_OUTPUT_VERBOSE((5, 
orte_grpcomm_base_framework.framework_output, "%s COLLECTIVE %d LOCALLY COMPLETE - SENDING TO GLOBAL COLLECTIVE", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), coll->id)); /* mark it as locally complete */ coll->locally_complete = true; /* pack the collective */ relay = OBJ_NEW(opal_buffer_t); orte_grpcomm_base_pack_collective(relay, jdata->jobid, coll, ORTE_GRPCOMM_INTERNAL_STG_LOCAL); /* send it to our global collective handler */ if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_NAME, relay, ORTE_RML_TAG_DAEMON_COLL, 0, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(relay); } } next_coll: item = opal_list_get_next(item); } }
static void track_procs(int fd, short argc, void *cbdata) { orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata; orte_process_name_t *proc = &caddy->name; orte_proc_state_t state = caddy->proc_state; orte_job_t *jdata; orte_proc_t *pdata, *pptr; opal_buffer_t *alert; int rc, i; orte_plm_cmd_flag_t cmd; int8_t flag; OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm:track_procs called for proc %s state %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(proc), orte_proc_state_to_str(state))); /* get the job object for this proc */ if (NULL == (jdata = orte_get_job_data_object(proc->jobid))) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } pdata = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, proc->vpid); if (NULL == pdata) { ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND); goto cleanup; } if (ORTE_PROC_STATE_RUNNING == state) { /* update the proc state */ pdata->state = state; jdata->num_launched++; /* don't update until we are told that all are done */ } else if (ORTE_PROC_STATE_REGISTERED == state) { /* update the proc state */ pdata->state = state; jdata->num_reported++; if (jdata->num_reported == jdata->num_local_procs) { /* once everyone registers, send their contact info to * the HNP so it is available to debuggers and anyone * else that needs it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: notifying HNP all local registered", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); alert = OBJ_NEW(opal_buffer_t); /* pack registered command */ cmd = ORTE_PLM_REGISTERED_CMD; if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the jobid */ if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &proc->jobid, 1, ORTE_JOBID))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack all the local child vpids */ for (i=0; i < orte_local_children->size; i++) { if (NULL == (pptr = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) { 
continue; } if (pptr->name.jobid == proc->jobid) { if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &pptr->name.vpid, 1, ORTE_VPID))) { ORTE_ERROR_LOG(rc); goto cleanup; } if (ORTE_FLAG_TEST(pptr, ORTE_PROC_FLAG_AS_MPI)) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &flag, 1, OPAL_INT8))) { ORTE_ERROR_LOG(rc); goto cleanup; } } } /* send it */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } else { rc = ORTE_SUCCESS; } } } else if (ORTE_PROC_STATE_IOF_COMPLETE == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_IOF_COMPLETE); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_WAITPID) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_WAITPID_FIRED == state) { /* do NOT update the proc state as this can hit * while we are still trying to notify the HNP of * successful launch for short-lived procs */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_WAITPID); if (ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_IOF_COMPLETE) && !ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { ORTE_ACTIVATE_PROC_STATE(proc, ORTE_PROC_STATE_TERMINATED); } } else if (ORTE_PROC_STATE_TERMINATED == state) { /* if this proc has not already recorded as terminated, then * update the accounting here */ if (!ORTE_FLAG_TEST(pdata, ORTE_PROC_FLAG_RECORDED)) { jdata->num_terminated++; } /* update the proc state */ ORTE_FLAG_SET(pdata, ORTE_PROC_FLAG_RECORDED); ORTE_FLAG_UNSET(pdata, ORTE_PROC_FLAG_ALIVE); pdata->state = state; /* Clean up the session directory as if we were the process * itself. This covers the case where the process died abnormally * and didn't cleanup its own session directory. 
*/ orte_session_dir_finalize(proc); /* track job status */ if (jdata->num_terminated == jdata->num_local_procs && !orte_get_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, NULL, OPAL_BOOL)) { /* pack update state command */ cmd = ORTE_PLM_UPDATE_PROC_STATE; alert = OBJ_NEW(opal_buffer_t); if (ORTE_SUCCESS != (rc = opal_dss.pack(alert, &cmd, 1, ORTE_PLM_CMD))) { ORTE_ERROR_LOG(rc); goto cleanup; } /* pack the job info */ if (ORTE_SUCCESS != (rc = pack_state_update(alert, jdata))) { ORTE_ERROR_LOG(rc); } /* send it */ OPAL_OUTPUT_VERBOSE((5, orte_state_base_framework.framework_output, "%s state:orcm: SENDING JOB LOCAL TERMINATION UPDATE FOR JOB %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_JOBID_PRINT(jdata->jobid))); if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, alert, ORTE_RML_TAG_PLM, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); } /* mark that we sent it so we ensure we don't do it again */ orte_set_attribute(&jdata->attributes, ORTE_JOB_TERM_NOTIFIED, ORTE_ATTR_LOCAL, NULL, OPAL_BOOL); } } cleanup: OBJ_RELEASE(caddy); }
/**
 * RML receive callback for daemon collective messages.
 *
 * Unpacks the collective id, jobid, and contributor count from the incoming
 * bucket, merges the payload into the local collective buffer, relays the
 * combined bucket along the routing tree once all expected peer buckets have
 * arrived, and finally delivers the completed collective to all participants.
 *
 * Fixes vs. prior version:
 *  - the relay loop previously fell through from the wildcard (xcast) branch
 *    into the sender-check branch, sending/releasing an already-released
 *    relay buffer; the branches are now a proper else-chain (matching the
 *    participant-delivery loop below, which was already correct);
 *  - namelist items removed while counting expected buckets were never
 *    released (leak); they are now released as they are consumed.
 */
static void daemon_coll_recv(int status, orte_process_name_t* sender,
                             opal_buffer_t* data, orte_rml_tag_t tag,
                             void* cbdata)
{
    orte_job_t *jdata;
    orte_std_cntr_t n;
    opal_list_item_t *item;
    orte_vpid_t np;
    int rc;
    orte_grpcomm_collective_t *coll;
    orte_namelist_t *nm;
    orte_grpcomm_coll_id_t id;
    bool do_progress;
    opal_buffer_t *relay;
    orte_jobid_t jobid;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:daemon_coll: daemon collective recvd from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* get the collective id */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:daemon_coll: WORKING COLLECTIVE %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id));

    /* setup the collective for this id - if it's already present,
     * then this will just return the existing structure */
    coll = orte_grpcomm_base_setup_collective(id);

    /* record that we received a bucket */
    coll->num_peer_buckets++;

    /* unpack the jobid */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &jobid, &n, ORTE_JOBID))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* find this job */
    do_progress = true;
    if (NULL == (jdata = orte_get_job_data_object(jobid))) {
        /* if we can't find it, then we haven't processed the
         * launch msg for this job yet - can't happen with
         * our own local procs, but this could involve a proc
         * running remotely that we don't know about yet */
        do_progress = false;
    }
    if (do_progress && 0 == jdata->num_local_procs) {
        /* no local procs will ever contribute, so we are
         * locally complete by definition */
        coll->locally_complete = true;
    }

    /* unpack the number of contributors involved in the incoming data */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(data, &np, &n, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        return;
    }
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:daemon_coll: NUM CONTRIBS: %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_VPID_PRINT(np)));
    /* add it to the number of global recvd */
    coll->num_global_recvd += np;
    /* transfer the data */
    opal_dss.copy_payload(&coll->buffer, data);

    /* are we done? */
    if (!do_progress || !coll->locally_complete) {
        /* can't continue - missing at least one launch msg
         * or not locally complete */
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:base:daemon_coll: CANNOT PROGRESS",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return;
    }

    /* determine how many buckets we should receive from others
     * involved in this collective - need to know the number
     * of total contributors from all buckets being relayed
     * thru us */
    orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_PEERS, coll);
    np = 1;  /* account for our own bucket */
    while (NULL != (item = opal_list_remove_first(&coll->targets))) {
        nm = (orte_namelist_t*)item;
        if (ORTE_VPID_WILDCARD == nm->name.vpid) {
            /* wait for input from all daemons */
            np = orte_process_info.num_procs;
            OBJ_RELEASE(nm);  /* release items as we consume them - was leaked */
            break;
        }
        np++;
        OBJ_RELEASE(nm);  /* release items as we consume them - was leaked */
    }
    /* clear the list for reuse (items left behind by the break above) */
    while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) {
        OBJ_RELEASE(nm);
    }

    /* relay the data, if required */
    if (np == coll->num_peer_buckets) {
        orte_routed.get_routing_list(ORTE_GRPCOMM_COLL_RELAY, coll);
        while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:base:daemon_coll: RELAYING COLLECTIVE TO %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&nm->name)));
            relay = OBJ_NEW(opal_buffer_t);
            orte_grpcomm_base_pack_collective(relay, jobid, coll,
                                              ORTE_GRPCOMM_INTERNAL_STG_GLOBAL);
            if (ORTE_VPID_WILDCARD == nm->name.vpid) {
                /* this is going to everyone in this job, so use xcast */
                orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_DAEMON_COLL);
                OBJ_RELEASE(relay);
            } else if (nm->name.vpid == sender->vpid) {
                /* don't send it back to the sender as that can
                 * create an infinite loop */
                OBJ_RELEASE(relay);
            } else {
                /* send to this member; on failure we retain ownership
                 * of relay and must release it ourselves */
                if (0 > orte_rml.send_buffer_nb(&nm->name, relay,
                                                ORTE_RML_TAG_DAEMON_COLL, 0,
                                                orte_rml_send_callback, NULL)) {
                    ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                    OBJ_RELEASE(relay);
                }
            }
            OBJ_RELEASE(nm);
        }
    }
    /* clear the list for reuse */
    while (NULL != (nm = (orte_namelist_t*)opal_list_remove_first(&coll->targets))) {
        OBJ_RELEASE(nm);
    }

    /* determine how many contributors we need to recv - we know
     * that all job objects were found, so we can skip that test
     * while counting */
    np = 0;
    for (item = opal_list_get_first(&coll->participants);
         item != opal_list_get_end(&coll->participants);
         item = opal_list_get_next(item)) {
        nm = (orte_namelist_t*)item;
        /* get the job object for this participant */
        jdata = orte_get_job_data_object(nm->name.jobid);
        if (ORTE_VPID_WILDCARD == nm->name.vpid) {
            /* all procs from this job are required to participate */
            np += jdata->num_procs;
        } else {
            np++;
        }
    }

    /* are we done? */
    if (np != coll->num_global_recvd) {
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:base:daemon_coll: MISSING CONTRIBUTORS: np %s ngr %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_VPID_PRINT(np),
                             ORTE_VPID_PRINT(coll->num_global_recvd)));
        return;
    }

    /* since we discovered that the collective is complete, we
     * need to send it to all the participants */
    for (item = opal_list_get_first(&coll->participants);
         item != opal_list_get_end(&coll->participants);
         item = opal_list_get_next(item)) {
        nm = (orte_namelist_t*)item;
        relay = OBJ_NEW(opal_buffer_t);
        opal_dss.pack(relay, &coll->id, 1, ORTE_GRPCOMM_COLL_ID_T);
        opal_dss.copy_payload(relay, &coll->buffer);
        /* if the vpid is wildcard, then this goes to
         * all daemons for relay */
        if (ORTE_VPID_WILDCARD == nm->name.vpid) {
            orte_grpcomm.xcast(nm->name.jobid, relay, ORTE_RML_TAG_COLLECTIVE);
            OBJ_RELEASE(relay);
        } else {
            /* send it to this proc */
            if (0 > orte_rml.send_buffer_nb(&nm->name, relay,
                                            ORTE_RML_TAG_COLLECTIVE, 0,
                                            orte_rml_send_callback, NULL)) {
                ORTE_ERROR_LOG(ORTE_ERR_COMM_FAILURE);
                OBJ_RELEASE(relay);
            }
        }
    }

    /* remove this collective */
    opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super);
    OBJ_RELEASE(coll);
}
/**
 * libevent read handler for a proc's stdout/stderr/stddiag pipe on the orted.
 *
 * Reads up to one fragment (ORTE_IOF_BASE_MSG_MAX bytes) from fd.  On data:
 * either writes it to a local file sink (when --output-filename was given)
 * or packs (tag, source name, bytes) into a buffer and forwards it to the
 * HNP via a non-blocking RML send, then re-arms the read event.  On EOF or
 * a hard error: releases the proc's read event(s) and, once all three
 * streams are closed, declares the proc's IOF complete.
 *
 * @param fd      descriptor being read
 * @param event   libevent flags (unused)
 * @param cbdata  orte_iof_read_event_t for this stream
 */
void orte_iof_orted_read_handler(int fd, short event, void *cbdata)
{
    orte_iof_read_event_t *rev = (orte_iof_read_event_t*)cbdata;
    unsigned char data[ORTE_IOF_BASE_MSG_MAX];
    opal_buffer_t *buf=NULL;
    int rc;
    int32_t numbytes;
    opal_list_item_t *item;
    orte_iof_proc_t *proct;
    orte_ns_cmp_bitmask_t mask;

    /* read up to the fragment size */
#if !defined(__WINDOWS__)
    numbytes = read(fd, data, sizeof(data));
#else
    {
        /* Windows has no read() on pipe handles - go through the
         * underlying OS handle instead */
        DWORD readed;
        HANDLE handle = (HANDLE)_get_osfhandle(fd);
        ReadFile(handle, data, sizeof(data), &readed, NULL);
        numbytes = (int)readed;
    }
#endif  /* !defined(__WINDOWS__) */

    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler read %d bytes from %s, fd %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         numbytes, ORTE_NAME_PRINT(&rev->name), fd));

    if (numbytes <= 0) {
        if (0 > numbytes) {
            /* either we have a connection error or it was a non-blocking read */
            if (EAGAIN == errno || EINTR == errno) {
                /* non-blocking, retry: re-arm the event and wait for more */
                opal_event_add(rev->ev, 0);
                return;
            }
            OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                                 "%s iof:orted:read handler %s Error on connection:%d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&rev->name), fd));
        }
        /* numbytes must have been zero, so go down and close the fd etc */
        goto CLEAN_RETURN;
    }

    /* see if the user wanted the output directed to files */
    if (NULL != orte_output_filename) {
        /* find the sink for this rank */
        for (item = opal_list_get_first(&mca_iof_orted_component.sinks);
             item != opal_list_get_end(&mca_iof_orted_component.sinks);
             item = opal_list_get_next(item)) {
            orte_iof_sink_t *sink = (orte_iof_sink_t*)item;
            /* if the target is set, then this sink is for another purpose - ignore it */
            if (ORTE_JOBID_INVALID != sink->daemon.jobid) {
                continue;
            }
            /* if this sink isn't for output, ignore it */
            if (ORTE_IOF_STDIN & sink->tag) {
                continue;
            }
            mask = ORTE_NS_CMP_ALL;
            /* is this the desired proc? */
            if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &sink->name, &rev->name)) {
                /* output to the corresponding file */
                orte_iof_base_write_output(&rev->name, rev->tag, data, numbytes, sink->wev);
                /* done */
                break;
            }
        }
        goto RESTART;
    }

    /* prep the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* pack the stream first - we do this so that flow control messages can
     * consist solely of the tag */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->tag, 1, ORTE_IOF_TAG))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack name of process that gave us this data */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &rev->name, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* pack the data - only pack the #bytes we read! */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &data, numbytes, OPAL_BYTE))) {
        ORTE_ERROR_LOG(rc);
        goto CLEAN_RETURN;
    }

    /* start non-blocking RML call to forward received data.
     * NOTE(review): buf ownership passes to the send; send_cb presumably
     * releases it on completion - confirm against send_cb's definition */
    OPAL_OUTPUT_VERBOSE((1, orte_iof_base_framework.framework_output,
                         "%s iof:orted:read handler sending %d bytes to HNP",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), numbytes));

    orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf, ORTE_RML_TAG_IOF_HNP,
                            send_cb, NULL);

 RESTART:
    /* re-add the event so we keep reading this stream */
    opal_event_add(rev->ev, 0);
    return;

 CLEAN_RETURN:
    /* must be an error, or zero bytes were read indicating that the
     * proc terminated this IOF channel - either way, find this proc
     * on our list and clean up */
    for (item = opal_list_get_first(&mca_iof_orted_component.procs);
         item != opal_list_get_end(&mca_iof_orted_component.procs);
         item = opal_list_get_next(item)) {
        proct = (orte_iof_proc_t*)item;
        mask = ORTE_NS_CMP_ALL;
        if (OPAL_EQUAL == orte_util_compare_name_fields(mask, &proct->name, &rev->name)) {
            /* found it - release corresponding event. This deletes
             * the read event and closes the file descriptor */
            if (rev->tag & ORTE_IOF_STDOUT) {
                if( NULL != proct->revstdout ) {
                    OBJ_RELEASE(proct->revstdout);
                }
            } else if (rev->tag & ORTE_IOF_STDERR) {
                if( NULL != proct->revstderr ) {
                    OBJ_RELEASE(proct->revstderr);
                }
            } else if (rev->tag & ORTE_IOF_STDDIAG) {
                if( NULL != proct->revstddiag ) {
                    OBJ_RELEASE(proct->revstddiag);
                }
            }
            /* check to see if they are all done */
            if (NULL == proct->revstdout &&
                NULL == proct->revstderr &&
                NULL == proct->revstddiag) {
                /* this proc's iof is complete */
                opal_list_remove_item(&mca_iof_orted_component.procs, item);
                ORTE_ACTIVATE_PROC_STATE(&proct->name, ORTE_PROC_STATE_IOF_COMPLETE);
                OBJ_RELEASE(proct);
            }
            break;
        }
    }
    if (NULL != buf) {
        /* send was never attempted - we still own the buffer */
        OBJ_RELEASE(buf);
    }
    return;
}
/**
 * this routine performs a test that indicates whether or not posix shared
 * memory can safely be used during this run.
 * note: that we want to run this test as few times as possible.
 *
 * If a hint naming the winning component is supplied, the run-time test is
 * skipped and the hint alone decides selection.  Otherwise we attempt a
 * real shm_open via shmem_posix_shm_open and, on success, unlink and close
 * the object and advertise our priority/module.
 *
 * Fix vs. prior version: the descriptor returned by shmem_posix_shm_open
 * was never closed, leaking one fd per query.
 *
 * @return OPAL_SUCCESS when posix can safely be used.
 */
static int
posix_runtime_query(mca_base_module_t **module, int *priority, const char *hint)
{
    char tmp_buff[OPAL_SHMEM_POSIX_FILE_LEN_MAX];
    int fd = -1;

    /* default: disqualified until proven runnable */
    *priority = 0;
    *module = NULL;

    /* if hint isn't null, then someone else already figured out who is the
     * best runnable component is AND the caller is relaying that info so we
     * don't have to perform a run-time query.
     */
    if (NULL != hint) {
        OPAL_OUTPUT_VERBOSE(
            (70, opal_shmem_base_output,
             "shmem: posix: runtime_query: "
             "attempting to use runtime hint (%s)\n", hint)
        );
        /* was i selected? if so, then we are done.
         * otherwise, disqualify myself.
         */
        if (0 == strcasecmp(hint,
                            mca_shmem_posix_component.super.base_version.mca_component_name)) {
            *priority = mca_shmem_posix_component.priority;
            *module = (mca_base_module_t *)&opal_shmem_posix_module.super;
            return OPAL_SUCCESS;
        }
        else {
            *priority = 0;
            *module = NULL;
            return OPAL_SUCCESS;
        }
    }

    /* if we are here, then perform a run-time query because we didn't get a
     * hint. it's either up to us to figure it out, or the caller wants us to
     * re-run the runtime query.
     */
    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_output,
         "shmem: posix: runtime_query: NO HINT PROVIDED:"
         "starting run-time test...\n")
    );

    /* shmem_posix_shm_open successfully shm_opened - we can use posix sm! */
    if (-1 != (fd = shmem_posix_shm_open(tmp_buff,
                                         OPAL_SHMEM_POSIX_FILE_LEN_MAX - 1))) {
        /* free up allocated resources before we return */
        if (0 != shm_unlink(tmp_buff)) {
            int err = errno;
            char hn[MAXHOSTNAMELEN];
            gethostname(hn, MAXHOSTNAMELEN - 1);
            hn[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-posix.txt", "sys call fail", 1, hn,
                           "shm_unlink(2)", "", strerror(err), err);
            /* something strange happened, so consider this a run-time test
             * failure even though shmem_posix_shm_open was successful */
        }
        /* all is well */
        else {
            *priority = mca_shmem_posix_component.priority;
            *module = (mca_base_module_t *)&opal_shmem_posix_module.super;
            rt_successful = true;
        }
        /* in either case close the descriptor - the object is already
         * unlinked (or unlink failed, in which case the fd is still all we
         * hold); previously this fd leaked on every query */
        (void)close(fd);
    }

    return OPAL_SUCCESS;
}
/**
 * Bring up the ORTE infrastructure for an application process.
 *
 * Performs, strictly in order: stdio buffering policy, opal_proc_t setup
 * for non-MPI apps, async progress thread start, then the framework
 * open/select sequence (state, errmgr, session dirs, OOB, RML, QoS,
 * routed, grpcomm), PLM init, RML comm enable, route init, optional FT/CR
 * setup, CR init, and DFS.  On any failure, jumps to the error label which
 * emits a show-help message and returns the failing code.
 *
 * @param db_restrict_local  NOTE(review): unused in this body - presumably
 *                           consumed elsewhere or vestigial; confirm
 * @return ORTE_SUCCESS, or the error code of the first failing step
 */
int orte_ess_base_app_setup(bool db_restrict_local)
{
    int ret;
    char *error = NULL;
    opal_value_t kv;

    /*
     * stdout/stderr buffering
     * If the user requested to override the default setting then do
     * as they wish.
     */
    if( orte_ess_base_std_buffering > -1 ) {
        if( 0 == orte_ess_base_std_buffering ) {
            /* unbuffered */
            setvbuf(stdout, NULL, _IONBF, 0);
            setvbuf(stderr, NULL, _IONBF, 0);
        }
        else if( 1 == orte_ess_base_std_buffering ) {
            /* line buffered */
            setvbuf(stdout, NULL, _IOLBF, 0);
            setvbuf(stderr, NULL, _IOLBF, 0);
        }
        else if( 2 == orte_ess_base_std_buffering ) {
            /* fully buffered */
            setvbuf(stdout, NULL, _IOFBF, 0);
            setvbuf(stderr, NULL, _IOFBF, 0);
        }
    }

    /* if I am an MPI app, we will let the MPI layer define and
     * control the opal_proc_t structure. Otherwise, we need to
     * do so here */
    if (ORTE_PROC_NON_MPI) {
        orte_process_info.super.proc_name = *(opal_process_name_t*)ORTE_PROC_MY_NAME;
        orte_process_info.super.proc_hostname = strdup(orte_process_info.nodename);
        orte_process_info.super.proc_flags = OPAL_PROC_ALL_LOCAL;
        orte_process_info.super.proc_arch = opal_local_arch;
        opal_proc_local_set(&orte_process_info.super);
    }

    /* get an async event base - we use the opal_async one so
     * we don't startup extra threads if not needed */
    orte_event_base = opal_start_progress_thread("opal_async", true);
    progress_thread_running = true;

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_state_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    /* open the errmgr (selected later, after the RML stack is up) */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_errmgr_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_framework.framework_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
        /* store the session directory location in the database */
        OBJ_CONSTRUCT(&kv, opal_value_t);
        kv.key = strdup(OPAL_DSTORE_JOB_SDIR);
        kv.type = OPAL_STRING;
        kv.data.string = strdup(orte_process_info.job_session_dir);
        if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal,
                                                     ORTE_PROC_MY_NAME, &kv))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&kv);
            error = "opal dstore store";
            goto error;
        }
        OBJ_DESTRUCT(&kv);
        /* and the proc-specific session directory as well */
        OBJ_CONSTRUCT(&kv, opal_value_t);
        kv.key = strdup(OPAL_DSTORE_MY_SDIR);
        kv.type = OPAL_STRING;
        kv.data.string = strdup(orte_process_info.proc_session_dir);
        if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal,
                                                     ORTE_PROC_MY_NAME, &kv))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&kv);
            error = "opal dstore store";
            goto error;
        }
        OBJ_DESTRUCT(&kv);
    }
    /* Setup the communication infrastructure */
    /*
     * OOB Layer
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_oob_base_select";
        goto error;
    }
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    /* Messaging QoS Layer */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_qos_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_qos_base_select";
        goto error;
    }
    /* setup the errmgr (opened earlier - selection needs the RML) */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    /* Routed system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    /* non-daemon/HNP apps can only have the default proxy PLM
     * module open - provide a chance for it to initialize */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }
    /* enable communication via the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    /* setup the routed info */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC
     */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_snapc_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_sstore_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, ORTE_PROC_IS_APP))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_sstore_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_sstore_base_select";
        goto error;
    }
    /* apps need the OPAL CR stuff */
    opal_cr_set_enabled(true);
#else
    opal_cr_set_enabled(false);
#endif
    /* Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    /* open the distributed file system */
    if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_dfs_base_framework, 0))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_dfs_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_dfs_base_select";
        goto error;
    }
    return ORTE_SUCCESS;

 error:
    if (!progress_thread_running) {
        /* can't send the help message, so ensure it
         * comes out locally
         */
        orte_show_help_finalize();
    }
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ret;
}
/* /!\ Called for each processes /!\ */ static int portals4_init_query(bool enable_progress_threads, bool enable_mpi_threads) { int ret; ptl_md_t md; ptl_me_t me; /* Initialize Portals and create a physical, matching interface */ ret = PtlInit(); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlInit failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlNIInit(PTL_IFACE_DEFAULT, PTL_NI_PHYSICAL | PTL_NI_MATCHING, PTL_PID_ANY, NULL, &mca_coll_portals4_component.ni_limits, &mca_coll_portals4_component.ni_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlNIInit failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ni_limits.max_atomic_size=%ld", mca_coll_portals4_component.ni_limits.max_atomic_size); if (mca_coll_portals4_component.portals_max_msg_size < mca_coll_portals4_component.ni_limits.max_msg_size) mca_coll_portals4_component.ni_limits.max_msg_size = mca_coll_portals4_component.portals_max_msg_size; opal_output_verbose(10, ompi_coll_base_framework.framework_output, "ni_limits.max_msg_size=%lu", mca_coll_portals4_component.ni_limits.max_msg_size); ret = PtlGetId(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.id); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlGetid failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* FIX ME: Need to make sure our ID matches with the MTL... 
*/ ret = PtlGetUid(mca_coll_portals4_component.ni_h, &mca_coll_portals4_component.uid); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlGetUid failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlEQAlloc(mca_coll_portals4_component.ni_h, MCA_COLL_PORTALS4_EQ_SIZE, &mca_coll_portals4_component.eq_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlEQAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, 0, mca_coll_portals4_component.eq_h, REQ_COLL_TABLE_ID, &mca_coll_portals4_component.pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } if (mca_coll_portals4_component.pt_idx != REQ_COLL_TABLE_ID) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", __FILE__, __LINE__, mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } ret = PtlPTAlloc(mca_coll_portals4_component.ni_h, 0, mca_coll_portals4_component.eq_h, REQ_COLL_FINISH_TABLE_ID, &mca_coll_portals4_component.finish_pt_idx); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } if (mca_coll_portals4_component.finish_pt_idx != REQ_COLL_FINISH_TABLE_ID) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlPTAlloc return wrong pt_idx: %d\n", __FILE__, __LINE__, mca_coll_portals4_component.finish_pt_idx); return OMPI_ERROR; } /* Bind MD/MDs across all memory. 
We prefer (for obvious reasons) to have a single MD across all of memory */ memset(&md, 0, sizeof(ptl_md_t)); md.start = 0; md.length = 0; md.options = 0; md.eq_handle = PTL_EQ_NONE; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(mca_coll_portals4_component.ni_h, &md, &mca_coll_portals4_component.zero_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } md.start = 0; md.length = PTL_SIZE_MAX; md.options = 0; md.eq_handle = PTL_EQ_NONE; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(mca_coll_portals4_component.ni_h, &md, &mca_coll_portals4_component.data_md_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMDBind failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } OPAL_OUTPUT_VERBOSE((90, ompi_coll_base_framework.framework_output, "PtlMDBind start=%p length=%lx\n", md.start, md.length)); /* setup finish ack ME */ me.start = NULL; me.length = 0; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; me.match_bits = 0; me.ignore_bits = 0; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.finish_pt_idx, &me, PTL_PRIORITY_LIST, NULL, &mca_coll_portals4_component.finish_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* This ME is used for RTR exchange only */ me.start = NULL; me.length = 0; me.ct_handle = PTL_CT_NONE; me.min_free = 0; me.uid = mca_coll_portals4_component.uid; me.options = PTL_ME_OP_PUT | PTL_ME_EVENT_SUCCESS_DISABLE | PTL_ME_EVENT_OVER_DISABLE | PTL_ME_EVENT_LINK_DISABLE | PTL_ME_EVENT_UNLINK_DISABLE; 
me.match_id.phys.nid = PTL_NID_ANY; me.match_id.phys.pid = PTL_PID_ANY; /* Note : the RTR bit must be set to match this ME, * this allows to discriminate the RTR from data flow * (especially for the Barrier operations) */ COLL_PORTALS4_SET_BITS(me.match_bits, 0, 0, 1, 0, 0, 0); me.ignore_bits = ~COLL_PORTALS4_RTR_MASK; ret = PtlMEAppend(mca_coll_portals4_component.ni_h, mca_coll_portals4_component.pt_idx, &me, PTL_OVERFLOW_LIST, NULL, &mca_coll_portals4_component.unex_me_h); if (PTL_OK != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: PtlMEAppend of barrier unexpected failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } /* activate progress callback */ ret = opal_progress_register(portals4_progress); if (OMPI_SUCCESS != ret) { opal_output_verbose(1, ompi_coll_base_framework.framework_output, "%s:%d: opal_progress_register failed: %d\n", __FILE__, __LINE__, ret); return OMPI_ERROR; } return OMPI_SUCCESS; }
/*
 * RML message handler implementing the ORTE data server: a simple
 * publish/lookup/unpublish registry mapping a service name to a port
 * string, keyed by the publishing process.  Every request is answered
 * on ORTE_RML_TAG_DATA_CLIENT with at least an int status code; LOOKUP
 * additionally returns the stored port string on success.
 *
 * status/tag/cbdata: standard RML callback arguments (status and cbdata
 *                    are unused here; tag identified this handler).
 * sender:            requesting process - recorded as owner on publish,
 *                    and checked for ownership on unpublish.
 * buffer:            command + command-specific payload to unpack.
 */
void orte_data_server(int status, orte_process_name_t* sender,
                      opal_buffer_t* buffer, orte_rml_tag_t tag,
                      void* cbdata)
{
    orte_data_server_cmd_t command;
    orte_std_cntr_t count;
    char *service_name, *port_name;
    orte_data_object_t *data;
    opal_buffer_t *answer;
    int rc, ret;   /* rc: local op status (sent via SEND_ERROR); ret: code returned to client */
    bool unique;

    OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                         "%s data server got message from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(sender)));

    /* first item in every request is the command */
    count = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &command, &count, ORTE_DATA_SERVER_CMD))) {
        ORTE_ERROR_LOG(rc);
        /* no answer buffer exists yet, so we cannot reply - just drop it */
        return;
    }

    /* reply buffer - released by the send callback, or explicitly below
     * if the send itself fails */
    answer = OBJ_NEW(opal_buffer_t);

    switch(command) {
    case ORTE_DATA_SERVER_PUBLISH:
        /* unpack the service name */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &service_name, &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto SEND_ERROR;
        }

        /* unpack the port name */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &port_name, &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto SEND_ERROR;
        }

        /* unpack uniqueness flag - if true, an existing entry must NOT
         * be overwritten */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &unique, &count, OPAL_BOOL))) {
            ORTE_ERROR_LOG(rc);
            goto SEND_ERROR;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: publishing service %s port %s %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name, port_name,
                             unique ? "UNIQUE" : "OVERWRITE"));

        /* check the current data store to see if this service name has already
         * been published */
        if (NULL != (data = lookup(service_name))) {
            /* already exists - see if overwrite allowed */
            if (unique) {
                /* return ORTE_EXISTS error code */
                OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                     "%s data server: publishing service %s port %s already exists",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     service_name, port_name));
                ret = ORTE_EXISTS;
            } else {
                OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                     "%s data server: overwriting service %s with port %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     service_name, port_name));
                if (NULL != data->port) {
                    free(data->port);
                }
                /* ownership of the unpacked port string transfers to the
                 * stored object; the new sender becomes the owner */
                data->port = port_name;
                data->owner.jobid = sender->jobid;
                data->owner.vpid = sender->vpid;
                ret = ORTE_SUCCESS;
            }
            if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
                ORTE_ERROR_LOG(rc);
                /* if we can't pack it, we probably can't pack the
                 * rc value either, so just send whatever is there */
            }
            goto SEND_ANSWER;
        }

        /* create a new data object */
        data = OBJ_NEW(orte_data_object_t);

        /* pass over the data values - these were malloc'd when unpacked,
         * so we don't need to strdup them here */
        data->service = service_name;
        data->port = port_name;
        data->owner.jobid = sender->jobid;
        data->owner.vpid = sender->vpid;

        /* store the data */
        data->index = opal_pointer_array_add(orte_data_server_store, data);

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: successfully published service %s port %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name, port_name));

        /* tell the user it was wonderful... */
        ret = ORTE_SUCCESS;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            /* if we can't pack it, we probably can't pack the
             * rc value either, so just send whatever is there */
        }
        goto SEND_ANSWER;
        break;

    case ORTE_DATA_SERVER_LOOKUP:
        /* unpack the service name */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &service_name, &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto SEND_ERROR;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: lookup on service %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name));

        /* locate this record in the data store */
        if (NULL == (data = lookup(service_name))) {
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s data server: service %s not found",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 service_name));
            /* return ORTE_ERR_NOT_FOUND error code */
            ret = ORTE_ERR_NOT_FOUND;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
                ORTE_ERROR_LOG(rc);
                /* if we can't pack it, we probably can't pack the
                 * rc value either, so just send whatever is there */
            }
            goto SEND_ANSWER;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: successful lookup on service %s port %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name, data->port));

        /* pack success so the unpack on the other end can
         * always unpack an int first */
        ret = ORTE_SUCCESS;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            /* if we can't pack it, we probably can't pack the
             * rc value either, so just send whatever is there */
            goto SEND_ANSWER;
        }

        /* pack the returned port */
        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &data->port, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            /* if we can't pack it, we probably can't pack the
             * rc value either, so just send whatever is there */
        }
        goto SEND_ANSWER;
        break;

    case ORTE_DATA_SERVER_UNPUBLISH:
        /* unpack the service name */
        count = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &service_name, &count, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto SEND_ERROR;
        }

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: unpublish on service %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name));

        /* locate this record in the data store */
        if (NULL == (data = lookup(service_name))) {
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s data server: service %s not found",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 service_name));
            /* return ORTE_ERR_NOT_FOUND error code */
            ret = ORTE_ERR_NOT_FOUND;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
                ORTE_ERROR_LOG(rc);
                /* if we can't pack it, we probably can't pack the
                 * rc value either, so just send whatever is there */
            }
            goto SEND_ANSWER;
        }

        /* check to see if the sender owns it - must be exact match */
        if (OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_ALL, &data->owner, sender)) {
            OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                                 "%s data server: service %s not owned by sender %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 service_name, ORTE_NAME_PRINT(sender)));
            /* nope - return ORTE_ERR_PERM error code */
            ret = ORTE_ERR_PERM;
            if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
                ORTE_ERROR_LOG(rc);
                /* if we can't pack it, we probably can't pack the
                 * rc value either, so just send whatever is there */
            }
            goto SEND_ANSWER;
        }

        /* delete the object from the data store */
        opal_pointer_array_set_item(orte_data_server_store, data->index, NULL);
        OBJ_RELEASE(data);

        OPAL_OUTPUT_VERBOSE((1, orte_debug_output,
                             "%s data server: service %s unpublished",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             service_name));

        /* tell the sender this succeeded */
        ret = ORTE_SUCCESS;
        if (ORTE_SUCCESS != (rc = opal_dss.pack(answer, &ret, 1, OPAL_INT))) {
            ORTE_ERROR_LOG(rc);
            /* if we can't pack it, we probably can't pack the
             * rc value either, so just send whatever is there */
        }
        goto SEND_ANSWER;
        break;

    default:
        ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
        rc = ORTE_ERR_BAD_PARAM;
        break;
    }

 SEND_ERROR:
    /* pack the error code - deliberately falls through into SEND_ANSWER
     * so the client always receives a status int */
    if (ORTE_SUCCESS != (ret = opal_dss.pack(answer, &rc, 1, OPAL_INT))) {
        ORTE_ERROR_LOG(ret);
    }

 SEND_ANSWER:
    /* send the reply non-blocking; on success the rml_cbfunc callback owns
     * (and releases) the answer buffer, so only release it here on failure */
    if (0 > (rc = orte_rml.send_buffer_nb(sender, answer, ORTE_RML_TAG_DATA_CLIENT, rml_cbfunc, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(answer);
    }
}
/*** MODEX SECTION ***/
/*
 * Exchange RTE information with all peers via the PMI database.
 *
 * Our own RTE data was pushed in the ESS pmi component; here we commit it,
 * then for every peer fetch its comma-separated "RTE" composite string
 * (uri,hostname,local_rank,node_rank[,cpuset]), split it, and store the
 * individual values back into the database along with a computed locality.
 *
 * Returns ORTE_SUCCESS, or the first error encountered while fetching
 * or storing a peer's data.
 *
 * FIX: rc was previously uninitialized, so a job with no peers to process
 * (e.g., a singleton) returned an indeterminate value.
 */
static int modex(orte_grpcomm_collective_t *coll)
{
    char *cptr, **fields;
    orte_vpid_t v;
    orte_process_name_t name;
    int rc = ORTE_SUCCESS;   /* default to success in case the loop has no work */
    opal_hwloc_locality_t locality;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;
    bool bound;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:pmi: modex entered",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* our RTE data was constructed and pushed in the ESS pmi component */

    /* commit our modex info */
    opal_db.commit((opal_identifier_t *)ORTE_PROC_MY_NAME);

    /* cycle thru all my peers and collect their RTE info */
    name.jobid = ORTE_PROC_MY_NAME->jobid;
    fields = NULL;
    for (v=0; v < orte_process_info.num_procs; v++) {
        if (v == ORTE_PROC_MY_NAME->vpid) {
            continue;   /* skip ourselves - our data is already committed */
        }
        name.vpid = v;
        /* fetch the RTE data for this proc */
        if (ORTE_SUCCESS != (rc = opal_db.fetch((opal_identifier_t*)&name, "RTE", (void **)&cptr, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* split on commas */
        fields = opal_argv_split(cptr, ',');
        free(cptr);
        /* sanity check - uri, hostname, local rank and node rank are mandatory */
        if (4 > opal_argv_count(fields)) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            opal_argv_free(fields);
            return ORTE_ERR_BAD_PARAM;
        }
        /* store the composite parts */
        /* first field is the URI */
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_RMLURI, fields[0], OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:pmi: proc %s oob endpoint %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&name), fields[0]));
        /* set the contact info into the hash table */
        if (ORTE_SUCCESS != (rc = orte_rml.set_contact_info(fields[0]))) {
            /* log the error for consistency with the other failure paths */
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        /* next is the hostname */
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_HOSTNAME, fields[1], OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        /* local rank */
        local_rank = strtoul(fields[2], NULL, 10);
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_LOCALRANK, &local_rank, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        /* node rank */
        node_rank = strtoul(fields[3], NULL, 10);
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_NODERANK, &node_rank, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        /* if the process was bound, then there will be another field
         * that contains its cpuset */
        if (5 == opal_argv_count(fields)) {
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_CPUSET, fields[4], OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                opal_argv_free(fields);
                return rc;
            }
            bound = true;
        } else {
            /* store a placeholder so we know that this value was retrieved,
             * but the proc wasn't bound */
            if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_CPUSET, NULL, OPAL_STRING))) {
                ORTE_ERROR_LOG(rc);
                opal_argv_free(fields);
                return rc;
            }
            bound = false;
        }
        /* compute and store the locality as it isn't something that gets pushed to PMI */
        if (0 != strcmp(fields[1], orte_process_info.nodename)) {
            /* this is on a different node, then mark as non-local */
            locality = OPAL_PROC_NON_LOCAL;
        } else if (!bound) {
            /* if we share a node, but we don't know anything more, then
             * mark us as on the node as this is all we know */
            locality = OPAL_PROC_ON_NODE;
        } else {
            /* determine relative location on our node */
            locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                             orte_process_info.cpuset,
                                                             fields[4]);
        }
        if (ORTE_SUCCESS != (rc = opal_db.store((opal_identifier_t*)&name, OPAL_DB_INTERNAL, ORTE_DB_LOCALITY, &locality, OPAL_HWLOC_LOCALITY_T))) {
            ORTE_ERROR_LOG(rc);
            opal_argv_free(fields);
            return rc;
        }
        /* cleanup */
        opal_argv_free(fields);
        fields = NULL;
    }

    /* execute the callback */
    coll->active = false;
    if (NULL != coll->cbfunc) {
        coll->cbfunc(NULL, coll->cbdata);
    }
    return rc;
}
int orte_sstore_central_global_sync(orte_sstore_base_handle_t handle) { int ret, exit_status = ORTE_SUCCESS; orte_sstore_central_global_snapshot_info_t *handle_info = NULL; OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle, "sstore:central:(global): sync()")); /* * Lookup the handle */ handle_info = find_handle_info(handle); if( SSTORE_GLOBAL_SYNCING != handle_info->state ) { handle_info->state = SSTORE_GLOBAL_SYNCING; if( ORTE_SNAPC_LOCAL_COORD_TYPE == (orte_snapc_coord_type & ORTE_SNAPC_LOCAL_COORD_TYPE) ) { return orte_sstore_central_local_sync(handle); } } /* * Synchronize all of the files */ while(handle_info->num_procs_synced < handle_info->num_procs_total) { opal_progress(); } /* * Finalize and close the metadata */ if( ORTE_SUCCESS != (ret = metadata_write_timestamp(handle_info)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } if( handle_info->migrating ) { if( ORTE_SUCCESS != (ret = metadata_write_int(handle_info, SSTORE_METADATA_INTERNAL_DONE_MIG_SEQ_STR, handle_info->seq_num)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } else { if( ORTE_SUCCESS != (ret = metadata_write_int(handle_info, SSTORE_METADATA_INTERNAL_DONE_SEQ_STR, handle_info->seq_num)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } } if( ORTE_SUCCESS != (ret = metadata_close(handle_info)) ) { ORTE_ERROR_LOG(ret); exit_status = ret; goto cleanup; } /* JJH: We should lock this var! */ if( !handle_info->migrating ) { orte_sstore_base_is_checkpoint_available = true; orte_sstore_handle_last_stable = orte_sstore_handle_current; } cleanup: return exit_status; }
/**
 * segment_attach can only be called after a successful call to segment_create
 *
 * Maps the backing file of an existing shared-memory segment into this
 * process and returns a pointer just past the internal segment header.
 * The creating process already holds the mapping (made in segment_create),
 * so it skips straight to returning the offset pointer.  Returns NULL on
 * open(2) or mmap(2) failure.
 */
static void *
segment_attach(opal_shmem_ds_t *ds_buf)
{
    /* only a non-creator process needs to establish its own mapping */
    if (getpid() != ds_buf->seg_cpid) {
        int save_errno;
        char host[MAXHOSTNAMELEN];

        ds_buf->seg_id = open(ds_buf->seg_name, O_CREAT | O_RDWR, 0600);
        if (-1 == ds_buf->seg_id) {
            save_errno = errno;
            gethostname(host, MAXHOSTNAMELEN - 1);
            host[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1,
                           host, "open(2)", "", strerror(save_errno),
                           save_errno);
            return NULL;
        }

        ds_buf->seg_base_addr = (unsigned char *)
            mmap(NULL, ds_buf->seg_size, PROT_READ | PROT_WRITE,
                 MAP_SHARED, ds_buf->seg_id, 0);
        if (MAP_FAILED == ds_buf->seg_base_addr) {
            save_errno = errno;
            gethostname(host, MAXHOSTNAMELEN - 1);
            host[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1,
                           host, "mmap(2)", "", strerror(save_errno),
                           save_errno);
            /* mmap failed, so close the file and return NULL - no error check
             * here because we are already in an error path... */
            close(ds_buf->seg_id);
            return NULL;
        }

        /* the mapping persists without the descriptor, so release it now.
         * if close fails here, that's okay. just let the user know and
         * continue. if we got this far, open and mmap were successful... */
        if (0 != close(ds_buf->seg_id)) {
            save_errno = errno;
            gethostname(host, MAXHOSTNAMELEN - 1);
            host[MAXHOSTNAMELEN - 1] = '\0';
            opal_show_help("help-opal-shmem-mmap.txt", "sys call fail", 1,
                           host, "close(2)", "", strerror(save_errno),
                           save_errno);
        }
    }
    /* else i was the segment creator. nothing to do here because all the hard
     * work was done in segment_create :-). */

    OPAL_OUTPUT_VERBOSE(
        (70, opal_shmem_base_framework.framework_output,
         "%s: %s: attach successful "
         "(id: %d, size: %lu, name: %s)\n",
         mca_shmem_mmap_component.super.base_version.mca_type_name,
         mca_shmem_mmap_component.super.base_version.mca_component_name,
         ds_buf->seg_id, (unsigned long)ds_buf->seg_size, ds_buf->seg_name)
    );

    /* update returned base pointer with an offset that hides our stuff */
    return (ds_buf->seg_base_addr + sizeof(opal_shmem_seg_hdr_t));
}
static void sstore_central_global_recv(int status, orte_process_name_t* sender, opal_buffer_t* buffer, orte_rml_tag_t tag, void* cbdata) { int ret; orte_sstore_central_cmd_flag_t command; orte_std_cntr_t count; orte_sstore_base_handle_t loc_id; orte_sstore_central_global_snapshot_info_t *handle_info = NULL; if( ORTE_RML_TAG_SSTORE_INTERNAL != tag ) { return; } /* * If this was an application process contacting us, then act like an orted * instead of an HNP */ if(OPAL_EQUAL != orte_util_compare_name_fields(ORTE_NS_CMP_JOBID, ORTE_PROC_MY_NAME, sender)) { orte_sstore_central_local_recv(status, sender, buffer, tag, cbdata); return; } OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle, "sstore:central:(global): process_cmd(%s)", ORTE_NAME_PRINT(sender))); count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) { ORTE_ERROR_LOG(ret); goto cleanup; } count = 1; if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &loc_id, &count, ORTE_SSTORE_HANDLE )) ) { ORTE_ERROR_LOG(ret); goto cleanup; } /* * Find the referenced handle */ if(NULL == (handle_info = find_handle_info(loc_id)) ) { ; /* JJH big problem */ } /* * Process the command */ if( ORTE_SSTORE_CENTRAL_PULL == command ) { process_local_pull(sender, buffer, handle_info); } else if( ORTE_SSTORE_CENTRAL_PUSH == command ) { process_local_push(sender, buffer, handle_info); } cleanup: return; }
/*
 * Implement MPI_Fetch_and_op over Portals4.
 *
 * Dispatches on the op: MPI_REPLACE -> PtlSwap (atomic swap),
 * MPI_NO_OP -> PtlGet (plain read), anything else -> PtlFetchAtomic.
 * In every case the previous target value lands in result_addr and the
 * outstanding-operation count (module->opcount) is bumped by one.
 *
 * Returns OMPI_SUCCESS or the first failing translation/Portals call.
 */
int
ompi_osc_portals4_fetch_and_op(void *origin_addr, void *result_addr,
                               struct ompi_datatype_t *dt, int target,
                               OPAL_PTRDIFF_TYPE target_disp,
                               struct ompi_op_t *op, struct ompi_win_t *win)
{
    int ret;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "fetch_and_op: 0x%lx, 0x%lx, %s, %d, %d, %s, 0x%lx",
                         (unsigned long) origin_addr,
                         (unsigned long) result_addr,
                         dt->name, target, (int) target_disp,
                         op->o_name,
                         (unsigned long) win));

    /* translate the MPI datatype to its Portals equivalent */
    ret = ompi_osc_portals4_get_dt(dt, &ptl_dt);
    if (OMPI_SUCCESS != ret) return ret;

    /* remote offset within the target window */
    offset = get_displacement(module, target) * target_disp;

    ret = ompi_datatype_type_size(dt, &length);
    if (OMPI_SUCCESS != ret) return ret;

    /* NOTE(review): size limit is only enforced via assert - in a
     * non-debug build an oversized datatype passes through unchecked;
     * confirm whether a runtime check is warranted */
    assert(length < module->fetch_atomic_max);

    /* count this outstanding operation for completion tracking */
    (void)opal_atomic_add_64(&module->opcount, 1);

    if (MPI_REPLACE == op) {
        ptl_handle_md_t result_md_h, origin_md_h;
        void *result_md_base, *origin_md_base;
        ptl_size_t result_md_offset, origin_md_offset;

        /* resolve each local address to its covering MD and the offset
         * within that MD */
        ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base);
        result_md_offset = ((char*) result_addr - (char*) result_md_base);
        ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base);
        origin_md_offset = ((char*) origin_addr - (char*) origin_md_base);

        /* atomic swap: old target value -> result, origin value -> target */
        ret = PtlSwap(result_md_h,
                      result_md_offset,
                      origin_md_h,
                      origin_md_offset,
                      length,
                      peer,
                      module->pt_idx,
                      module->match_bits,
                      offset,
                      NULL,
                      0,
                      NULL,
                      PTL_SWAP,
                      ptl_dt);
    } else if (MPI_NO_OP == op) {
        ptl_handle_md_t md_h;
        void *md_base;
        ptl_size_t md_offset;

        ompi_osc_portals4_get_md(result_addr, module->md_h, &md_h, &md_base);
        md_offset = ((char*) result_addr - (char*) md_base);

        /* no-op fetch: just read the target value into result_addr */
        ret = PtlGet(md_h,
                     md_offset,
                     length,
                     peer,
                     module->pt_idx,
                     module->match_bits,
                     offset,
                     NULL);
    } else {
        ptl_handle_md_t result_md_h, origin_md_h;
        void *result_md_base, *origin_md_base;
        ptl_size_t result_md_offset, origin_md_offset;

        /* general case: translate the MPI op and do a fetching atomic */
        ret = ompi_osc_portals4_get_op(op, &ptl_op);
        if (OMPI_SUCCESS != ret) return ret;

        ompi_osc_portals4_get_md(result_addr, module->md_h, &result_md_h, &result_md_base);
        result_md_offset = ((char*) result_addr - (char*) result_md_base);
        ompi_osc_portals4_get_md(origin_addr, module->md_h, &origin_md_h, &origin_md_base);
        origin_md_offset = ((char*) origin_addr - (char*) origin_md_base);

        ret = PtlFetchAtomic(result_md_h,
                             result_md_offset,
                             origin_md_h,
                             origin_md_offset,
                             length,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset,
                             NULL,
                             0,
                             ptl_op,
                             ptl_dt);
    }
    if (OMPI_SUCCESS != ret) {
        return ret;
    }

    return OMPI_SUCCESS;
}
/* process incoming coll returns */
/*
 * RML handler for collective results arriving at an application process.
 *
 * Two sources are distinguished:
 *  - our own daemon: the global collective identified by the unpacked id
 *    is complete - hand the buffer to the next step or the final callback;
 *  - a peer application process: this is a contribution to a purely
 *    proc-local collective - record the sender, accumulate its payload,
 *    and complete the collective once every participant has contributed.
 *
 * FIX: in the peer path, completion previously removed `item` from the
 * active list.  When the collective had just been created here (race with
 * the sender), `item` still pointed at the list end sentinel from the
 * search loop, corrupting the list.  We now remove `&coll->super`, which
 * is correct in both cases.
 */
static void app_recv(int status, orte_process_name_t* sender,
                     opal_buffer_t* buffer, orte_rml_tag_t tag,
                     void* cbdata)
{
    orte_grpcomm_collective_t *coll, *cptr;
    opal_list_item_t *item;
    int n, rc;
    orte_grpcomm_coll_id_t id;
    orte_namelist_t *nm;

    /* get the collective id */
    n = 1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(buffer, &id, &n, ORTE_GRPCOMM_COLL_ID_T))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:base:receive processing collective return for id %d recvd from %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), id,
                         ORTE_NAME_PRINT(sender)));

    /* if the sender is my daemon, then this collective is
     * a global one and is complete */
    if (ORTE_PROC_MY_DAEMON->jobid == sender->jobid &&
        ORTE_PROC_MY_DAEMON->vpid == sender->vpid) {
        /* search my list of active collectives */
        for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
             item != opal_list_get_end(&orte_grpcomm_base.active_colls);
             item = opal_list_get_next(item)) {
            coll = (orte_grpcomm_collective_t*)item;
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s CHECKING COLL id %d",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 coll->id));
            if (id == coll->id) {
                /* see if the collective needs another step */
                if (NULL != coll->next_cb) {
                    /* have to go here next */
                    coll->next_cb(buffer, coll->next_cbdata);
                    break;
                }
                /* flag the collective as complete */
                coll->active = false;
                /* cleanup */
                opal_list_remove_item(&orte_grpcomm_base.active_colls, item);
                /* callback the specified function */
                if (NULL != coll->cbfunc) {
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:base:receive executing callback",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                    coll->cbfunc(buffer, coll->cbdata);
                }
                /* do NOT release the collective - it is the responsibility
                 * of whomever passed it down to us */
                break;
            }
        }
        return;
    }

    /* this came from another application process, so it
     * belongs to a non-global collective taking place
     * only between procs. Since there is a race condition
     * between when we might create our own collective and
     * when someone might send it to us, we may not have
     * the collective on our list - see if we do */
    coll = NULL;
    for (item = opal_list_get_first(&orte_grpcomm_base.active_colls);
         item != opal_list_get_end(&orte_grpcomm_base.active_colls);
         item = opal_list_get_next(item)) {
        cptr = (orte_grpcomm_collective_t*)item;
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s CHECKING COLL id %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             cptr->id));
        if (id == cptr->id) {
            /* aha - we do have it */
            coll = cptr;
            break;
        }
    }
    if (NULL == coll) {
        /* nope - add it */
        coll = OBJ_NEW(orte_grpcomm_collective_t);
        coll->id = id;
        opal_list_append(&orte_grpcomm_base.active_colls, &coll->super);
    }
    /* append the sender to the list of targets so
     * we know we already have their contribution */
    nm = OBJ_NEW(orte_namelist_t);
    nm->name.jobid = sender->jobid;
    nm->name.vpid = sender->vpid;
    opal_list_append(&coll->targets, &nm->super);

    /* transfer the rest of the incoming data to the collection bucket.
     * Note that we don't transfer it to the collective's buffer
     * as the modex itself uses that */
    opal_dss.copy_payload(&coll->local_bucket, buffer);

    /* if the length of the participant list equals the
     * length of the target list, then the collective is
     * complete */
    if (opal_list_get_size(&coll->participants) == opal_list_get_size(&coll->targets)) {
        /* replace whatever is in the collective's buffer
         * field with what we collected */
        OBJ_DESTRUCT(&coll->buffer);
        OBJ_CONSTRUCT(&coll->buffer, opal_buffer_t);
        opal_dss.copy_payload(&coll->buffer, &coll->local_bucket);
        /* see if the collective needs another step */
        if (NULL != coll->next_cb) {
            /* have to go here next */
            coll->next_cb(&coll->buffer, coll->next_cbdata);
            return;
        }
        /* flag the collective as complete */
        coll->active = false;
        /* cleanup - remove the collective itself; do NOT use `item`
         * here as it may be the list-end sentinel when the collective
         * was created above rather than found in the search loop */
        opal_list_remove_item(&orte_grpcomm_base.active_colls, &coll->super);
        /* callback the specified function */
        if (NULL != coll->cbfunc) {
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                 "%s grpcomm:base:receive executing callback",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            coll->cbfunc(&coll->buffer, coll->cbdata);
        }
        /* do NOT release the collective - it is the responsibility
         * of whomever passed it down to us */
    }
}
/*
 * Implement MPI_Raccumulate over Portals4 (contiguous datatypes only).
 *
 * The transfer is chunked to module->atomic_max; MPI_REPLACE maps to
 * PtlPut, any other op to PtlAtomic.  Completion is tracked through the
 * allocated request (returned via ompi_req) and module->opcount.
 *
 * FIXES vs. original:
 *  - PtlAtomic was passed the *remote* displacement (offset + sent) as
 *    its local MD offset; the local offset must be md_offset + sent,
 *    matching the PtlPut branch.
 *  - op/datatype translation failures inside the loop returned without
 *    releasing the request (leak); translation is now done once up
 *    front with proper cleanup.
 */
int
ompi_osc_portals4_raccumulate(void *origin_addr, int origin_count,
                              struct ompi_datatype_t *origin_dt,
                              int target, OPAL_PTRDIFF_TYPE target_disp,
                              int target_count,
                              struct ompi_datatype_t *target_dt,
                              struct ompi_op_t *op, struct ompi_win_t *win,
                              struct ompi_request_t **ompi_req)
{
    int ret;
    ompi_osc_portals4_request_t *request;
    ompi_osc_portals4_module_t *module =
        (ompi_osc_portals4_module_t*) win->w_osc_module;
    ptl_process_t peer = ompi_osc_portals4_get_peer(module, target);
    size_t length, sent;
    size_t offset;
    ptl_op_t ptl_op;
    ptl_datatype_t ptl_dt;
    ptl_handle_md_t md_h;
    void *md_base;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s 0x%lx",
                         (unsigned long) origin_addr, origin_count,
                         origin_dt->name, target, (int) target_disp,
                         target_count, target_dt->name,
                         op->o_name,
                         (unsigned long) win));

    OMPI_OSC_PORTALS4_REQUEST_ALLOC(win, request);
    if (NULL == request) return OMPI_ERR_TEMP_OUT_OF_RESOURCE;
    *ompi_req = &request->super;

    /* remote offset within the target window */
    offset = get_displacement(module, target) * target_disp;

    if (!ompi_datatype_is_contiguous_memory_layout(origin_dt, origin_count) ||
        !ompi_datatype_is_contiguous_memory_layout(target_dt, target_count)) {
        OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
        opal_output(ompi_osc_base_framework.framework_output,
                    "MPI_Raccumulate: transfer of non-contiguous memory is not currently supported.\n");
        return OMPI_ERR_NOT_SUPPORTED;
    } else {
        ptl_size_t md_offset;

        ret = ompi_datatype_type_size(origin_dt, &length);
        if (OMPI_SUCCESS != ret) {
            OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
            return ret;
        }
        length *= origin_count;

        /* resolve the Portals op/datatype once, before any data movement,
         * so a bad translation fails cleanly without leaking the request */
        if (MPI_REPLACE != op) {
            ret = ompi_osc_portals4_get_dt(origin_dt, &ptl_dt);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            ret = ompi_osc_portals4_get_op(op, &ptl_op);
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
        }

        sent = 0;

        /* resolve the local address to its covering MD and the offset
         * within that MD */
        ompi_osc_portals4_get_md(origin_addr, module->req_md_h, &md_h, &md_base);
        md_offset = ((char*) origin_addr - (char*) md_base);

        do {
            /* chunk the transfer to the NIC's atomic size limit */
            size_t msg_length = MIN(module->atomic_max, length - sent);

            (void)opal_atomic_add_64(&module->opcount, 1);
            request->ops_expected++;

            if (MPI_REPLACE == op) {
                ret = PtlPut(md_h,
                             md_offset + sent,
                             msg_length,
                             PTL_ACK_REQ,
                             peer,
                             module->pt_idx,
                             module->match_bits,
                             offset + sent,
                             request,
                             0);
            } else {
                /* local MD offset is md_offset + sent (the original
                 * erroneously passed the remote offset here) */
                ret = PtlAtomic(md_h,
                                md_offset + sent,
                                msg_length,
                                PTL_ACK_REQ,
                                peer,
                                module->pt_idx,
                                module->match_bits,
                                offset + sent,
                                request,
                                0,
                                ptl_op,
                                ptl_dt);
            }
            if (OMPI_SUCCESS != ret) {
                OMPI_OSC_PORTALS4_REQUEST_RETURN(request);
                return ret;
            }
            sent += msg_length;
        } while (sent < length);
    }

    return OMPI_SUCCESS;
}
/**
 * Parse one logical line of a hostfile, starting from the already-lexed
 * first token, and append the resulting orte_node_t to either @updates
 * (normal hosts) or @exclude (hosts prefixed with '^').
 *
 * @param token     first token of the line, produced by orte_util_hostfile_lex()
 * @param updates   list receiving included/updated nodes (ownership transfers)
 * @param exclude   list receiving excluded ('^'-prefixed) nodes
 * @param keep_all  if true, always create a new node object instead of
 *                  merging with an existing entry found via hostfile_lookup()
 * @return ORTE_SUCCESS or ORTE_ERROR on syntax/value errors
 *
 * NOTE(review): relies on lexer globals (orte_util_hostfile_value,
 * orte_util_hostfile_done, orte_util_hostfile_lex) and cur_hostfile_name,
 * all defined elsewhere in this file.
 */
static int hostfile_parse_line(int token, opal_list_t* updates,
                               opal_list_t* exclude, bool keep_all)
{
    int rc;
    orte_node_t* node;
    bool got_count = false;
    bool got_max = false;
    char* value;
    char** argv;
    char* node_name = NULL;
    char* node_alias = NULL;
    char* username = NULL;
    int cnt;
    int number_of_slots = 0;   /* NOTE(review): never modified below, so the
                                * (number_of_slots > node->slots) check in the
                                * keyword loop can never fire -- confirm intent */
    char buff[64];

    if (ORTE_HOSTFILE_STRING == token ||
        ORTE_HOSTFILE_HOSTNAME == token ||
        ORTE_HOSTFILE_INT == token ||
        ORTE_HOSTFILE_IPV4 == token ||
        ORTE_HOSTFILE_IPV6 == token) {
        /* the token is (or can be rendered as) a hostname; an integer
         * token is converted to its decimal string form */
        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        /* split an optional "user@host" form */
        argv = opal_argv_split (value, '@');
        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* if the first letter of the name is '^', then this is a node
         * to be excluded. Remove the ^ character so the nodename is
         * usable, and put it on the exclude list
         */
        if ('^' == node_name[0]) {
            int i, len;
            /* shift the string left by one to drop the leading '^' */
            len = strlen(node_name);
            for (i=1; i < len; i++) {
                node_name[i-1] = node_name[i];
            }
            node_name[len-1] = '\0';  /* truncate */

            OPAL_OUTPUT_VERBOSE((3, orte_debug_output,
                                 "%s hostfile: node %s is being excluded",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name));

            /* convert this into something globally unique: replace any
             * local alias (e.g. "localhost") with our resolved nodename */
            if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
                /* Nodename has been allocated, that is for sure */
                if (orte_show_resolved_nodenames &&
                    0 != strcmp(node_name, orte_process_info.nodename)) {
                    node_alias = strdup(node_name);
                }
                free (node_name);
                node_name = strdup(orte_process_info.nodename);
            }

            /* Do we need to make a new node object?  First check to see
               if it's already in the exclude list */
            if (NULL == (node = hostfile_lookup(exclude, node_name))) {
                node = OBJ_NEW(orte_node_t);
                node->name = node_name;    /* node takes ownership of node_name */
                if (NULL != username) {
                    node->username = strdup(username);
                }
            }
            /* Note that we need to add this back to the exclude list.
               If it was found, we just removed it (in hostfile_lookup()),
               so this puts it back.  If it was not found, then we have
               to add it to the exclude list anyway. */
            opal_list_append(exclude, &node->super);
            return ORTE_SUCCESS;
        }

        /* this is not a node to be excluded, so we need to process it and
         * add it to the "include" list. See if this host is actually us.
         */
        if (strcmp(node_name, "localhost") == 0 || opal_ifislocal(node_name)) {
            /* Nodename has been allocated, that is for sure */
            if (orte_show_resolved_nodenames &&
                0 != strcmp(node_name, orte_process_info.nodename)) {
                node_alias = strdup(node_name);
            }
            free (node_name);
            node_name = strdup(orte_process_info.nodename);
        }

        OPAL_OUTPUT_VERBOSE((3, orte_debug_output,
                             "%s hostfile: node %s is being included - keep all is %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node_name,
                             keep_all ? "TRUE" : "FALSE"));

        /* Do we need to make a new node object? First check to see
         * if we are keeping everything or if it's already in the updates
         * list. Because we check keep_all first, if that is set we will
         * not do the hostfile_lookup call, and thus won't remove the
         * pre-existing node from the updates list
         */
        if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;        /* node takes ownership of node_name */
            if (NULL != username) {
                node->username = strdup(username);
            }
        }
        /* do we need to record an alias for this node? */
        if (NULL != node_alias) {
            /* add to list of aliases for this node - only add if unique */
            opal_argv_append_unique_nosize(&node->alias, node_alias, false);
            free(node_alias);
        }
    } else if (ORTE_HOSTFILE_RELATIVE == token) {
        /* store this for later processing */
        node = OBJ_NEW(orte_node_t);
        node->name = strdup(orte_util_hostfile_value.sval);
    } else if (ORTE_HOSTFILE_RANK == token) {
        /* we can ignore the rank, but we need to extract the node name. we
         * first need to shift over to the other side of the equal sign as
         * this is where the node name will be
         */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_EQUAL != token) {
            token = orte_util_hostfile_lex();
        }
        if (orte_util_hostfile_done) {
            /* bad syntax somewhere */
            return ORTE_ERROR;
        }
        /* next position should be the node name */
        token = orte_util_hostfile_lex();
        if(ORTE_HOSTFILE_INT == token) {
            snprintf(buff, 64, "%d", orte_util_hostfile_value.ival);
            value = buff;
        } else {
            value = orte_util_hostfile_value.sval;
        }
        /* split an optional "user@host" form */
        argv = opal_argv_split (value, '@');
        cnt = opal_argv_count (argv);
        if (1 == cnt) {
            node_name = strdup(argv[0]);
        } else if (2 == cnt) {
            username = strdup(argv[0]);
            node_name = strdup(argv[1]);
        } else {
            opal_output(0, "WARNING: Unhandled user@host-combination\n"); /* XXX */
        }
        opal_argv_free (argv);

        /* Do we need to make a new node object? First check to see
         * if we are keeping everything or if it's already in the updates
         * list. Because we check keep_all first, if that is set we will
         * not do the hostfile_lookup call, and thus won't remove the
         * pre-existing node from the updates list
         */
        if (keep_all || NULL == (node = hostfile_lookup(updates, node_name))) {
            node = OBJ_NEW(orte_node_t);
            node->name = node_name;        /* node takes ownership of node_name */
            if (NULL != username) {
                node->username = strdup(username);
            }
        }
        /* add a slot */
        node->slots++;
        /* do we need to record an alias for this node?
         * NOTE(review): node_alias is never assigned on this branch, so this
         * block is currently dead here -- confirm whether that is intended */
        if (NULL != node_alias) {
            /* add to list of aliases for this node - only add if unique */
            opal_argv_append_unique_nosize(&node->alias, node_alias, false);
            free(node_alias);
        }
        /* skip to end of line */
        while (!orte_util_hostfile_done &&
               ORTE_HOSTFILE_NEWLINE != token) {
            token = orte_util_hostfile_lex();
        }
        opal_list_append(updates, &node->super);
        return ORTE_SUCCESS;
    } else {
        hostfile_parse_error(token);
        return ORTE_ERROR;
    }

    /* consume any per-host keyword=value modifiers that follow the
     * hostname on the same line (slots, max_slots, boards, ...) */
    got_count = false;
    while (!orte_util_hostfile_done) {
        token = orte_util_hostfile_lex();

        switch (token) {
        case ORTE_HOSTFILE_DONE:
            goto done;

        case ORTE_HOSTFILE_NEWLINE:
            goto done;

        case ORTE_HOSTFILE_USERNAME:
            node->username = hostfile_parse_string();
            break;

        case ORTE_HOSTFILE_BOARDS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "boards",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->boards = rc;
            break;

        case ORTE_HOSTFILE_SOCKETS_PER_BOARD:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "sockets",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->sockets_per_board = rc;
            break;

        case ORTE_HOSTFILE_CORES_PER_SOCKET:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "cores",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            node->cores_per_socket = rc;
            break;

        case ORTE_HOSTFILE_CPU_SET:
            if (NULL != node->cpu_set) {
                free(node->cpu_set);
            }
            node->cpu_set = hostfile_parse_string();
            break;

        case ORTE_HOSTFILE_COUNT:
        case ORTE_HOSTFILE_CPU:
        case ORTE_HOSTFILE_SLOTS:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "slots",
                               true,
                               cur_hostfile_name, rc);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* slots accumulate across repeated keywords / repeated lines */
            node->slots += rc;
            got_count = true;

            /* Ensure that slots_max >= slots */
            if (node->slots_max != 0 && node->slots_max < node->slots) {
                node->slots_max = node->slots;
            }
            break;

        case ORTE_HOSTFILE_SLOTS_MAX:
            rc = hostfile_parse_int();
            if (rc < 0) {
                orte_show_help("help-hostfile.txt", "max_slots",
                               true,
                               cur_hostfile_name, ((size_t) rc));
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            /* Only take this update if it puts us >= node_slots */
            if (rc >= node->slots) {
                if (node->slots_max != rc) {
                    node->slots_max = rc;
                    got_max = true;
                }
            } else {
                orte_show_help("help-hostfile.txt", "max_slots_lt",
                               true,
                               cur_hostfile_name, node->slots, rc);
                ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
                OBJ_RELEASE(node);
                return ORTE_ERROR;
            }
            break;

        default:
            hostfile_parse_error(token);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
        /* NOTE(review): number_of_slots is always 0 (see declaration), so
         * this sanity check is currently unreachable */
        if (number_of_slots > node->slots) {
            ORTE_ERROR_LOG(ORTE_ERR_BAD_PARAM);
            OBJ_RELEASE(node);
            return ORTE_ERROR;
        }
    }

done:
    /* no explicit slot count given: fall back to slots_max if one was
     * provided, otherwise count this line as a single slot */
    if (!got_count) {
        if (got_max) {
            node->slots = node->slots_max;
        } else {
            ++node->slots;
        }
    }
    opal_list_append(updates, &node->super);

    return ORTE_SUCCESS;
}