/*
 * Front-end function to delete all the attributes on an MPI object
 */
int ompi_attr_delete_all(ompi_attribute_type_t type, void *object,
                         opal_hash_table_t *attr_hash)
{
    int ret, i, num_attrs;
    uint32_t key;
    void *node, *in_node, *attr;
    attribute_value_t **attrs;

    /* Ensure that the table is not empty */
    if (NULL == attr_hash) {
        return MPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&attribute_lock);

    /* Make an array that contains all attributes in local object's hash */
    num_attrs = opal_hash_table_get_size(attr_hash);
    if (0 == num_attrs) {
        OPAL_THREAD_UNLOCK(&attribute_lock);
        return MPI_SUCCESS;
    }

    attrs = malloc(sizeof(attribute_value_t *) * num_attrs);
    if (NULL == attrs) {
        OPAL_THREAD_UNLOCK(&attribute_lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    ret = opal_hash_table_get_first_key_uint32(attr_hash, &key, &attr, &node);
    for (i = 0; OMPI_SUCCESS == ret; i++) {
        attrs[i] = attr;
        in_node = node;
        ret = opal_hash_table_get_next_key_uint32(attr_hash, &key, &attr,
                                                  in_node, &node);
    }

    /* Sort attributes in the order that they were set */
    qsort(attrs, num_attrs, sizeof(attribute_value_t *), compare_attr_sequence);

    /* Delete attributes in the reverse order that they were set.
       Actually this ordering is required only for MPI_COMM_SELF, as
       specified in MPI-2.2: 8.7.1 Allowing User Functions at Process
       Termination, but we do it for everything -- what the heck. :-) */
    for (i = num_attrs - 1; i >= 0; i--) {
        ret = ompi_attr_delete_impl(type, object, attr_hash,
                                    attrs[i]->av_key, true);
        if (OMPI_SUCCESS != ret) {
            break;
        }
    }

    /* All done */
    free(attrs);
    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
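/* The comparator handed to qsort() above is referenced but not defined in
   this listing.  A minimal sketch of what it presumably looks like,
   assuming attribute_value_t carries an av_sequence counter recording the
   order in which attributes were set (which is what the "sort attributes
   in the order that they were set" comment implies): */
static int compare_attr_sequence(const void *attr1, const void *attr2)
{
    /* ascending by set-order sequence number; the deletion loop above
       then walks the sorted array backwards to get reverse-of-set order */
    return (*((attribute_value_t **) attr1))->av_sequence -
           (*((attribute_value_t **) attr2))->av_sequence;
}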
int ompi_osc_rdma_flush_all (struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_sync_t *lock;
    int ret = OMPI_SUCCESS;
    uint32_t key;
    void *node;

    /* flush is only allowed from within a passive target epoch */
    if (!ompi_osc_rdma_in_passive_epoch (module)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all: %s", win->w_name);

    /* globally complete all outstanding rdma requests */
    if (OMPI_OSC_RDMA_SYNC_TYPE_LOCK == module->all_sync.type) {
        ompi_osc_rdma_sync_rdma_complete (&module->all_sync);
    }

    /* flush all locks */
    ret = opal_hash_table_get_first_key_uint32 (&module->outstanding_locks, &key,
                                                (void **) &lock, &node);
    while (OPAL_SUCCESS == ret) {
        OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "flushing lock %p", (void *) lock);
        ompi_osc_rdma_sync_rdma_complete (lock);
        ret = opal_hash_table_get_next_key_uint32 (&module->outstanding_locks, &key,
                                                   (void **) &lock, node, &node);
    }

    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "flush_all complete");

    /* ret holds the (expected) end-of-table code from the traversal, so
       return the OMPI-level success code explicitly */
    return OMPI_SUCCESS;
}
/*
 * Delete all the attributes on an MPI object
 */
int ompi_attr_delete_all(ompi_attribute_type_t type, void *object,
                         opal_hash_table_t *attr_hash)
{
    int key_ret, del_ret;
    uint32_t key, oldkey;
    void *node, *in_node, *old_attr;

    /* Ensure that the table is not empty */
    if (NULL == attr_hash) {
        return MPI_SUCCESS;
    }

    /* Lock this whole sequence of events -- don't let any other
       thread modify the structure of the attribute hash or bitmap
       while we're traversing it */
    OPAL_THREAD_LOCK(&attr_hash_lock);

    /* Get the first key in local object's hash */
    key_ret = opal_hash_table_get_first_key_uint32(attr_hash,
                                                   &key, &old_attr, &node);
    OPAL_THREAD_UNLOCK(&attr_hash_lock);

    del_ret = OMPI_SUCCESS;
    while (OMPI_SUCCESS == key_ret && OMPI_SUCCESS == del_ret) {
        /* Save this node info for deletion, before we move onto the
           next node */
        in_node = node;
        oldkey = key;

        /* Move to the next node */
        OPAL_THREAD_LOCK(&attr_hash_lock);
        key_ret = opal_hash_table_get_next_key_uint32(attr_hash, &key, &old_attr,
                                                      in_node, &node);
        OPAL_THREAD_UNLOCK(&attr_hash_lock);

        /* Now delete this attribute */
        del_ret = ompi_attr_delete(type, object, attr_hash, oldkey, true);
    }

    /* All done */
    return del_ret;
}
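/* The ordering above is the load-bearing detail: the iterator is advanced
   to the next node *before* ompi_attr_delete() destroys the current key,
   because deleting the current entry may free the hash node the cursor
   points at.  The same delete-while-iterating idiom, sketched against a
   standalone opal_hash_table_t (the helper name is illustrative): */
static void drain_table_sketch(opal_hash_table_t *table)
{
    uint32_t key, doomed_key;
    void *value, *node, *in_node;
    int ret;

    ret = opal_hash_table_get_first_key_uint32(table, &key, &value, &node);
    while (OPAL_SUCCESS == ret) {
        doomed_key = key;
        in_node = node;
        /* advance the cursor first ... */
        ret = opal_hash_table_get_next_key_uint32(table, &key, &value,
                                                  in_node, &node);
        /* ... then it is safe to remove the saved key */
        opal_hash_table_remove_value_uint32(table, doomed_key);
    }
}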
/**
 * Function to pack all the entries in the SOS table and send it
 * over to the HNP.
 *
 * @return ORTE_SUCCESS Upon success
 * @return ORTE_ERROR (or another ORTE error code) Upon failure
 *
 * ADK: Presently, we simply rely on orte_show_help to do the aggregation on
 * a per-error basis.
 */
static int opal_sos_send_table(void)
{
    opal_sos_error_t *opal_error;
    opal_buffer_t *buf;
    uint32_t key;
    int rc;
    size_t table_size;
    void *prev_error, *next_error;

    next_error = NULL;
    buf = OBJ_NEW(opal_buffer_t);
    if (NULL == buf) {
        return ORTE_ERR_OUT_OF_RESOURCE;
    }

    OPAL_THREAD_LOCK(&opal_sos_table_lock);
    table_size = opal_hash_table_get_size(&opal_sos_table);

    /* Pack the size of the SOS error table */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &table_size, 1, OPAL_SIZE))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }

    if (OPAL_SUCCESS != opal_hash_table_get_first_key_uint32(&opal_sos_table,
                                                             &key,
                                                             (void**)&opal_error,
                                                             &prev_error)) {
        rc = ORTE_ERROR;
        goto error;
    }

    /* Pack the sos error object */
    if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) {
        ORTE_ERROR_LOG(rc);
        goto error;
    }

    while (OPAL_SUCCESS ==
           opal_hash_table_get_next_key_uint32(&opal_sos_table, &key,
                                               (void**)&opal_error,
                                               prev_error, &next_error)) {
        if (ORTE_SUCCESS != (rc = opal_dss_pack_sos_error(buf, opal_error))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* advance the traversal cursor so the loop makes progress */
        prev_error = next_error;
    }
    OPAL_THREAD_UNLOCK(&opal_sos_table_lock);

    /* Now send the buffer (rc = number of bytes sent) */
    rc = orte_rml.send_buffer(ORTE_PROC_MY_HNP, buf,
                              ORTE_RML_TAG_NOTIFIER_HNP, 0);
    if (rc <= 0) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        return rc;
    }

    return ORTE_SUCCESS;

error:
    OPAL_THREAD_UNLOCK(&opal_sos_table_lock);
    OBJ_RELEASE(buf);
    return rc;
}
/*
 * Copy all the attributes from one MPI object to another.  Called
 * when MPI objects are copied (e.g., back-end actions to
 * MPI_COMM_DUP).
 */
int ompi_attr_copy_all(ompi_attribute_type_t type, void *old_object,
                       void *new_object, opal_hash_table_t *oldattr_hash,
                       opal_hash_table_t *newattr_hash)
{
    int ret;
    int err;
    uint32_t key;
    int flag;
    void *node, *in_node;
    attribute_value_t *old_attr, *new_attr;
    ompi_attribute_keyval_t *hash_value;

    /* If there's nothing to do, just return */
    if (NULL == oldattr_hash) {
        return MPI_SUCCESS;
    }

    OPAL_THREAD_LOCK(&attribute_lock);

    /* Get the first attribute in the object's hash */
    ret = opal_hash_table_get_first_key_uint32(oldattr_hash, &key,
                                               (void **) &old_attr, &node);

    /* While we still have some attribute in the object's key hash */
    while (OMPI_SUCCESS == ret) {
        in_node = node;

        /* Get the keyval in the main keyval hash - so that we know
           what the copy_attr_fn is */
        err = opal_hash_table_get_value_uint32(keyval_hash, key,
                                               (void **) &hash_value);
        if (OMPI_SUCCESS != err) {
            /* This should not happen! */
            ret = MPI_ERR_INTERN;
            goto out;
        }

        err = 0;
        new_attr = OBJ_NEW(attribute_value_t);
        switch (type) {
        case COMM_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(communicator, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        case TYPE_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(datatype, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        case WIN_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(win, old_object, hash_value,
                                old_attr, new_object, new_attr, err);
            break;

        default:
            /* This should not happen */
            assert(0);
            break;
        }
        /* Did the callback return non-MPI_SUCCESS? */
        if (0 != err) {
            ret = err;
            goto out;
        }

        /* Hang this off the object's hash */

        /* The COPY_ATTR_CALLBACKS macro will have converted the
           _flag_ callback output value from Fortran's .TRUE. value to
           0/1 (if necessary).  So we only need to check for 0/1 here
           -- not .TRUE. */
        if (1 == flag) {
            if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77)) {
                if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77_INT)) {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_FINT;
                } else {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_AINT;
                }
            } else {
                new_attr->av_set_from = OMPI_ATTRIBUTE_C;
            }
            ret = set_value(type, new_object, &newattr_hash, key,
                            new_attr, true);
            if (MPI_SUCCESS != ret) {
                goto out;
            }
        } else {
            OBJ_RELEASE(new_attr);
        }

        ret = opal_hash_table_get_next_key_uint32(oldattr_hash, &key,
                                                  (void **) &old_attr,
                                                  in_node, &node);
    }
    ret = MPI_SUCCESS;

 out:
    /* All done */
    opal_atomic_wmb();
    OPAL_THREAD_UNLOCK(&attribute_lock);
    return ret;
}
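/* What ompi_attr_copy_all() implements at the user level: when an MPI
   object is duplicated, each attribute's copy callback decides -- via its
   flag output argument -- whether that attribute propagates to the new
   object.  A small user-level illustration using the standard keyval API
   (a sketch, not part of the surrounding source): */
#include <mpi.h>

static int my_copy_fn(MPI_Comm oldcomm, int keyval, void *extra_state,
                      void *attribute_val_in, void *attribute_val_out,
                      int *flag)
{
    /* shallow-copy the attribute value and ask for it to be propagated */
    *(void **) attribute_val_out = attribute_val_in;
    *flag = 1;   /* 0 would mean "do not copy to the new communicator" */
    return MPI_SUCCESS;
}

/* Usage: after
     MPI_Comm_create_keyval(my_copy_fn, MPI_COMM_NULL_DELETE_FN, &keyval, NULL);
     MPI_Comm_set_attr(MPI_COMM_WORLD, keyval, &some_value);
     MPI_Comm_dup(MPI_COMM_WORLD, &newcomm);
   the dup triggers the copy path above, and because *flag was set to 1 the
   attribute is also visible on newcomm via MPI_Comm_get_attr(). */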
/*
 * Copy all the attributes from one MPI object to another
 */
int ompi_attr_copy_all(ompi_attribute_type_t type, void *old_object,
                       void *new_object, opal_hash_table_t *oldattr_hash,
                       opal_hash_table_t *newattr_hash)
{
    int ret;
    int err;
    uint32_t key;
    int flag;
    void *node, *in_node;
    attribute_value_t *old_attr, *new_attr;
    ompi_attribute_keyval_t *hash_value;

    /* If there's nothing to do, just return */
    if (NULL == oldattr_hash) {
        return MPI_SUCCESS;
    }

    /* Lock this whole sequence of events -- don't let any other
       thread modify the structure of the attribute hash or bitmap
       while we're traversing it */
    OPAL_THREAD_LOCK(&attr_hash_lock);

    /* Get the first attribute in the object's hash */
    ret = opal_hash_table_get_first_key_uint32(oldattr_hash, &key,
                                               (void **) &old_attr, &node);
    OPAL_THREAD_UNLOCK(&attr_hash_lock);

    /* While we still have some attribute in the object's key hash */
    while (OMPI_SUCCESS == ret) {
        in_node = node;

        /* Get the keyval in the main keyval hash - so that we know
           what the copy_attr_fn is */
        OPAL_THREAD_LOCK(&keyval_hash_lock);
        err = opal_hash_table_get_value_uint32(keyval_hash, key,
                                               (void **) &hash_value);
        OPAL_THREAD_UNLOCK(&keyval_hash_lock);

        new_attr = OBJ_NEW(attribute_value_t);
        switch (type) {
        case UNUSED_ATTR:
            /* keep the compiler happy */
            assert(0);
            break;

        case COMM_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(communicator, old_object, hash_value,
                                old_attr, new_object, new_attr);
            break;

        case TYPE_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(datatype, old_object, hash_value,
                                old_attr, new_object, new_attr);
            break;

        case WIN_ATTR:
            /* Now call the copy_attr_fn */
            COPY_ATTR_CALLBACKS(win, old_object, hash_value,
                                old_attr, new_object, new_attr);
            break;
        }

        /* Hang this off the object's hash */

        /* The "predefined" parameter to ompi_attr_set() is set to 1,
           so that no comparison is done for predefined at all and it
           just falls off the error checking loop in attr_set */
        if (1 == flag) {
            if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77)) {
                if (0 != (hash_value->attr_flag & OMPI_KEYVAL_F77_MPI1)) {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_FORTRAN_MPI1;
                } else {
                    new_attr->av_set_from = OMPI_ATTRIBUTE_FORTRAN_MPI2;
                }
            } else {
                new_attr->av_set_from = OMPI_ATTRIBUTE_C;
            }
            set_value(type, new_object, &newattr_hash, key, new_attr, true);
        } else {
            OBJ_RELEASE(new_attr);
        }

        OPAL_THREAD_LOCK(&attr_hash_lock);
        ret = opal_hash_table_get_next_key_uint32(oldattr_hash, &key,
                                                  (void **) &old_attr,
                                                  in_node, &node);
        OPAL_THREAD_UNLOCK(&attr_hash_lock);
    }

    /* All done */
    return MPI_SUCCESS;
}
int ompi_osc_pt2pt_free(ompi_win_t *win)
{
    int ret = OMPI_SUCCESS;
    ompi_osc_pt2pt_module_t *module = GET_MODULE(win);
    ompi_osc_pt2pt_peer_t *peer;
    uint32_t key;
    void *node;

    if (NULL == module) {
        return OMPI_SUCCESS;
    }

    if (NULL != module->comm) {
        opal_output_verbose(1, ompi_osc_base_framework.framework_output,
                            "pt2pt component destroying window with id %d",
                            ompi_comm_get_cid(module->comm));

        /* finish with a barrier */
        if (ompi_group_size(win->w_group) > 1) {
            (void) module->comm->c_coll.coll_barrier (module->comm,
                                                      module->comm->c_coll.coll_barrier_module);
        }

        /* remove from component information */
        OPAL_THREAD_SCOPED_LOCK(&mca_osc_pt2pt_component.lock,
                                opal_hash_table_remove_value_uint32(&mca_osc_pt2pt_component.modules,
                                                                    ompi_comm_get_cid(module->comm)));
    }

    win->w_osc_module = NULL;

    OBJ_DESTRUCT(&module->outstanding_locks);
    OBJ_DESTRUCT(&module->locks_pending);
    OBJ_DESTRUCT(&module->locks_pending_lock);
    OBJ_DESTRUCT(&module->cond);
    OBJ_DESTRUCT(&module->lock);
    OBJ_DESTRUCT(&module->all_sync);

    /* it is erroneous to close a window with active operations on it so we should
     * probably produce an error here instead of cleaning up */
    OPAL_LIST_DESTRUCT(&module->pending_acc);

    osc_pt2pt_gc_clean (module);
    OPAL_LIST_DESTRUCT(&module->request_gc);
    OPAL_LIST_DESTRUCT(&module->buffer_gc);
    OBJ_DESTRUCT(&module->gc_lock);

    ret = opal_hash_table_get_first_key_uint32 (&module->peer_hash, &key,
                                                (void **) &peer, &node);
    while (OPAL_SUCCESS == ret) {
        OBJ_RELEASE(peer);
        ret = opal_hash_table_get_next_key_uint32 (&module->peer_hash, &key,
                                                   (void **) &peer, node, &node);
    }

    OBJ_DESTRUCT(&module->peer_hash);
    OBJ_DESTRUCT(&module->peer_lock);

    if (NULL != module->epoch_outgoing_frag_count) {
        free(module->epoch_outgoing_frag_count);
    }

    if (NULL != module->frag_request && MPI_REQUEST_NULL != module->frag_request) {
        module->frag_request->req_complete_cb = NULL;
        ompi_request_cancel (module->frag_request);
        ompi_request_free (&module->frag_request);
    }
    if (NULL != module->comm) {
        ompi_comm_free(&module->comm);
    }
    if (NULL != module->incoming_buffer) {
        free (module->incoming_buffer);
    }
    if (NULL != module->free_after) {
        free(module->free_after);
    }

    free (module);

    return OMPI_SUCCESS;
}
void orte_state_base_check_all_complete(int fd, short args, void *cbdata)
{
    orte_state_caddy_t *caddy = (orte_state_caddy_t*)cbdata;
    orte_job_t *jdata = caddy->jdata;

    orte_proc_t *proc;
    int i;
    orte_std_cntr_t j;
    orte_job_t *job;
    orte_node_t *node;
    orte_job_map_t *map;
    orte_std_cntr_t index;
    bool one_still_alive;
    orte_vpid_t lowest=0;
    int32_t i32, *i32ptr;
    uint32_t u32;
    void *nptr;
    char *rtmod;

    opal_output_verbose(2, orte_state_base_framework.framework_output,
                        "%s state:base:check_job_complete on job %s",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                        (NULL == jdata) ? "NULL" : ORTE_JOBID_PRINT(jdata->jobid));

    /* get our "lifeline" routed module */
    rtmod = orte_rml.get_routed(orte_mgmt_conduit);

    if (NULL == jdata || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        /* just check to see if the daemons are complete */
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_complete - received NULL job, checking daemons",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto CHECK_DAEMONS;
    } else {
        /* mark the job as terminated, but don't override any
         * abnormal termination flags
         */
        if (jdata->state < ORTE_JOB_STATE_UNTERMINATED) {
            jdata->state = ORTE_JOB_STATE_TERMINATED;
        }
    }

    /* tell the IOF that the job is complete */
    if (NULL != orte_iof.complete) {
        orte_iof.complete(jdata);
    }

    /* tell the PMIx server to release its data */
    if (NULL != opal_pmix.server_deregister_nspace) {
        opal_pmix.server_deregister_nspace(jdata->jobid, NULL, NULL);
    }

    i32ptr = &i32;
    if (orte_get_attribute(&jdata->attributes, ORTE_JOB_NUM_NONZERO_EXIT, (void**)&i32ptr, OPAL_INT32) &&
        !orte_abort_non_zero_exit) {
        if (!orte_report_child_jobs_separately || 1 == ORTE_LOCAL_JOBID(jdata->jobid)) {
            /* update the exit code */
            ORTE_UPDATE_EXIT_STATUS(lowest);
        }

        /* warn user */
        opal_output(orte_clean_output,
                    "-------------------------------------------------------\n"
                    "While %s job %s terminated normally, %d %s. Further examination may be required.\n"
                    "-------------------------------------------------------",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "the primary" : "child",
                    (1 == ORTE_LOCAL_JOBID(jdata->jobid)) ? "" : ORTE_LOCAL_JOBID_PRINT(jdata->jobid),
                    i32, (1 == i32) ? "process returned\na non-zero exit code." :
                    "processes returned\nnon-zero exit codes.");
    }

    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed declared job %s terminated with state %s - checking all jobs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid),
                         orte_job_state_to_str(jdata->state)));

    /* if this job is a continuously operating one, then don't do
     * anything further - just return here
     */
    if (NULL != jdata &&
        (orte_get_attribute(&jdata->attributes, ORTE_JOB_CONTINUOUS_OP, NULL, OPAL_BOOL) ||
         ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RECOVERABLE))) {
        goto CHECK_ALIVE;
    }

    /* if the job that is being checked is the HNP, then we are
     * trying to terminate the orteds. In that situation, we
     * do -not- check all jobs - we simply notify the HNP
     * that the orteds are complete. Also check special case
     * if jdata is NULL - we want
     * to definitely declare the job done if the orteds
     * have completed, no matter what else may be happening.
     * This can happen if a ctrl-c hits in the "wrong" place
     * while launching
     */
 CHECK_DAEMONS:
    if (jdata == NULL || jdata->jobid == ORTE_PROC_MY_NAME->jobid) {
        if (0 == orte_routed.num_routes(rtmod)) {
            /* orteds are done! */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s orteds complete - exiting",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            if (NULL == jdata) {
                jdata = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);
            }
            ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DAEMONS_TERMINATED);
            OBJ_RELEASE(caddy);
            return;
        }
        OBJ_RELEASE(caddy);
        return;
    }

    /* Release the resources used by this job. Since some errmgrs may want
     * to continue using resources allocated to the job as part of their
     * fault recovery procedure, we only do this once the job is "complete".
     * Note that an aborted/killed job -is- flagged as complete and will
     * therefore have its resources released. We need to do this after
     * we call the errmgr so that any attempt to restart the job will
     * avoid doing so in the exact same place as the current job
     */
    if (NULL != jdata->map && jdata->state == ORTE_JOB_STATE_TERMINATED) {
        map = jdata->map;
        for (index = 0; index < map->nodes->size; index++) {
            if (NULL == (node = (orte_node_t*)opal_pointer_array_get_item(map->nodes, index))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s releasing procs for job %s from node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid), node->name));
            for (i = 0; i < node->procs->size; i++) {
                if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, i))) {
                    continue;
                }
                if (proc->name.jobid != jdata->jobid) {
                    /* skip procs from another job */
                    continue;
                }
                node->slots_inuse--;
                node->num_procs--;
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s releasing proc %s from node %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc->name), node->name));
                /* set the entry in the node array to NULL */
                opal_pointer_array_set_item(node->procs, i, NULL);
                /* release the proc once for the map entry */
                OBJ_RELEASE(proc);
            }
            /* set the node location to NULL */
            opal_pointer_array_set_item(map->nodes, index, NULL);
            /* flag that the node is no longer in a map - must be done
             * before we release our reference to the node */
            ORTE_FLAG_UNSET(node, ORTE_NODE_FLAG_MAPPED);
            /* maintain accounting */
            OBJ_RELEASE(node);
        }
        OBJ_RELEASE(map);
        jdata->map = NULL;
    }

 CHECK_ALIVE:
    /* now check to see if all jobs are done - trigger notification of this jdata
     * object when we find it
     */
    one_still_alive = false;
    j = opal_hash_table_get_first_key_uint32(orte_job_data, &u32, (void **)&job, &nptr);
    while (OPAL_SUCCESS == j) {
        /* skip the daemon job */
        if (job->jobid == ORTE_PROC_MY_NAME->jobid) {
            goto next;
        }
        /* if this is the job we are checking AND it normally terminated,
         * then activate the "notify_completed" state - this will release
         * the job state, but is provided so that the HNP main code can
         * take alternative actions if desired. If the state is killed_by_cmd,
         * then go ahead and release it. We cannot release it if it
         * abnormally terminated as mpirun needs the info so it can
         * report appropriately to the user
         *
         * NOTE: do not release the primary job (j=1) so we
         * can pretty-print completion message
         */
        if (NULL != jdata && job->jobid == jdata->jobid) {
            if (jdata->state == ORTE_JOB_STATE_TERMINATED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is terminated - activating notify",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_NOTIFY_COMPLETED);
                one_still_alive = true;
            } else if (jdata->state == ORTE_JOB_STATE_KILLED_BY_CMD ||
                       jdata->state == ORTE_JOB_STATE_NOTIFIED) {
                OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                     "%s state:base:check_job_completed state is killed or notified - cleaning up",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                /* release this object, ensuring that the
                 * pointer array internal accounting
                 * is maintained!
                 */
                if (1 < j) {
                    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_DEBUGGER_DAEMON)) {
                        /* this was a debugger daemon. notify that a debugger has detached */
                        ORTE_ACTIVATE_JOB_STATE(jdata, ORTE_JOB_STATE_DEBUGGER_DETACH);
                    }
                    OBJ_RELEASE(jdata);
                }
            }
            goto next;
        }
        /* if the job is flagged to not be monitored, skip it */
        if (ORTE_FLAG_TEST(job, ORTE_JOB_FLAG_DO_NOT_MONITOR)) {
            goto next;
        }
        /* when checking for job termination, we must be sure to NOT check
         * our own job as it - rather obviously - has NOT terminated!
         */
        if (ORTE_JOB_STATE_NOTIFIED != job->state) {
            /* we have at least one job that is not done yet - we cannot
             * just return, though, as we need to ensure we cleanout the
             * job data for the job that just completed
             */
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is not terminated (%d:%d)",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs));
            one_still_alive = true;
        } else {
            OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                                 "%s state:base:check_job_completed job %s is terminated (%d vs %d [%s])",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(job->jobid),
                                 job->num_terminated, job->num_procs,
                                 (NULL == jdata) ? "UNKNOWN" : orte_job_state_to_str(jdata->state)));
        }
    next:
        j = opal_hash_table_get_next_key_uint32(orte_job_data, &u32, (void **)&job, nptr, &nptr);
    }

    /* if a job is still alive, we just return */
    if (one_still_alive) {
        OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                             "%s state:base:check_job_completed at least one job is not terminated",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(caddy);
        return;
    }

    /* if we get here, then all jobs are done, so terminate */
    OPAL_OUTPUT_VERBOSE((2, orte_state_base_framework.framework_output,
                         "%s state:base:check_job_completed all jobs terminated",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* stop the job timeout event, if set */
    if (NULL != orte_mpiexec_timeout) {
        OBJ_RELEASE(orte_mpiexec_timeout);
        orte_mpiexec_timeout = NULL;
    }

    /* set the exit status to 0 - this will only happen if it
     * wasn't already set by an error condition
     */
    ORTE_UPDATE_EXIT_STATUS(0);

    /* order daemon termination - this tells us to cleanup
     * our local procs as well as telling remote daemons
     * to die
     */
    orte_plm.terminate_orteds();

    OBJ_RELEASE(caddy);
}