/*
 * Abort this application process with the given exit status.
 *
 * @param status  exit code passed to exit()
 * @param report  if true, notify our local daemon of the abort and wait
 *                for its acknowledgment before exiting
 *
 * Deliberately does NOT run the normal finalize path: we are aborting due
 * to an abnormal condition, and a full finalize would very likely hang.
 */
void orte_ess_base_app_abort(int status, bool report)
{
    orte_daemon_cmd_flag_t cmd = ORTE_DAEMON_ABORT_CALLED;
    opal_buffer_t *buf;

    /* Exit - do NOT do a normal finalize as this will very likely
     * hang the process. We are aborting due to an abnormal condition
     * that precludes normal cleanup
     *
     * We do need to do the following bits to make sure we leave a
     * clean environment. Taken from orte_finalize():
     * - Assume errmgr cleans up child processes before we exit.
     */

    /* CRS cleanup since it may have a named pipe and thread active */
    orte_cr_finalize();

    /* If we were asked to report this termination, do so */
    if (report) {
        /* pack the abort command and send it (non-blocking) to our daemon */
        buf = OBJ_NEW(opal_buffer_t);
        opal_dss.pack(buf, &cmd, 1, ORTE_DAEMON_CMD);
        orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buf, ORTE_RML_TAG_DAEMON, 0,
                                orte_rml_send_callback, NULL);
        OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                             "%s orte_ess_app_abort: sent abort msg to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));
        /* get the ack - need this to ensure that the sync communication
         * gets serviced by the event library on the orted prior to the
         * process exiting
         */
        sync_waiting = true;  /* file-scope flag cleared by report_sync */
        if (ORTE_SUCCESS != orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ABORT,
                                                    ORTE_RML_NON_PERSISTENT, report_sync, NULL)) {
            /* could not even post the recv - just exit */
            exit(status);
        }
        /* spin the event loop until report_sync flips sync_waiting */
        ORTE_WAIT_FOR_COMPLETION(sync_waiting);
    }

    /* - Clean out the global structures
     * (not really necessary, but good practice)
     */
    orte_proc_info_finalize();

    /* Now Exit */
    exit(status);
}
/*
 * Establish routing for the given job under the radix routed module.
 *
 * @param job   jobid whose routes are being initialized
 * @param ndat  optional buffer of RML contact/routing data; NULL means we
 *              are being called during orte_init
 * @return ORTE_SUCCESS or an ORTE error code
 */
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
    /* the radix module routes all proc communications through
     * the local daemon. Daemons must identify which of their
     * daemon-peers is "hosting" the specified recipient and
     * route the message to that daemon. Daemon contact info
     * is handled elsewhere, so all we need to do here is
     * ensure that the procs are told to route through their
     * local daemon, and that daemons are told how to route
     * for each proc
     */
    int rc;

    /* if I am a tool, then I stand alone - there is nothing to do */
    if (ORTE_PROC_IS_TOOL) {
        return ORTE_SUCCESS;
    }

    /* if I am a daemon or HNP, then I have to extract the routing info for this job
     * from the data sent to me for launch and update the routing tables to
     * point at the daemon for each proc
     */
    if (ORTE_PROC_IS_DAEMON) {
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_radix: init routes for daemon job %s\n\thnp_uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri));

        if (NULL == ndat) {
            /* indicates this is being called during orte_init.
             * Get the HNP's name for possible later use
             */
            if (NULL == orte_process_info.my_hnp_uri) {
                /* fatal error - a daemon cannot operate without HNP contact info */
                ORTE_ERROR_LOG(ORTE_ERR_FATAL);
                return ORTE_ERR_FATAL;
            }
            /* set the contact info into the hash table */
            orte_rml.set_contact_info(orte_process_info.my_hnp_uri);
            /* extract the hnp name and store it */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                               ORTE_PROC_MY_HNP, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* if we are using static ports, set my lifeline to point at my parent */
            if (orte_static_ports) {
                lifeline = ORTE_PROC_MY_PARENT;
            } else {
                /* set our lifeline to the HNP - we will abort if that connection is lost */
                lifeline = ORTE_PROC_MY_HNP;
            }
            /* daemons will send their contact info back to the HNP as
             * part of the message confirming they are read to go. HNP's
             * load their contact info during orte_init
             */
        } else {
            /* ndat != NULL means we are getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                ORTE_ERROR_LOG(rc);
            }
            /* returns ORTE_SUCCESS when the update succeeded */
            return rc;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed_radix: completed init routes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_IS_HNP) {
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_radix: init routes for HNP job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job)));

        if (NULL == ndat) {
            /* the HNP has no lifeline */
            lifeline = NULL;
        } else {
            /* if this is for my own jobid, then I am getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_PROC_MY_NAME->jobid == job) {
                if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            } else {
                /* if not, then I need to process the callback */
                if (ORTE_SUCCESS != (rc = orte_routed_base_process_callback(job, ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }
        return ORTE_SUCCESS;
    }

    {  /* MUST BE A PROC */
        /* if ndat != NULL, then this is being invoked by the proc to
         * init a route to a specified process that is outside of our
         * job family. We want that route to go through our HNP, routed via
         * out local daemon - however, we cannot know for
         * certain that the HNP already knows how to talk to the specified
         * procs. For example, in OMPI's publish/subscribe procedures, the
         * DPM framework looks for an mca param containing the global ompi-server's
         * uri. This info will come here so the proc can setup a route to
         * the server - we need to pass the routing info to our HNP
         */
        if (NULL != ndat) {
            int rc;  /* NOTE(review): shadows the outer rc - intentional-looking but worth confirming */
            opal_buffer_t *xfer;
            orte_rml_cmd_flag_t cmd = ORTE_RML_UPDATE_CMD;
            bool ack_waiting;

            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                                 "%s routed_radix: init routes w/non-NULL data",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
                /* if this is for a different job family, then we route via our HNP
                 * to minimize connection counts to entities such as ompi-server, so
                 * start by sending the contact info to the HNP for update
                 */
                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                                     "%s routed_radix_init_routes: diff job family - sending update to %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));

                /* prep the buffer for transmission to the HNP */
                xfer = OBJ_NEW(opal_buffer_t);
                opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
                opal_dss.copy_payload(xfer, ndat);

                /* save any new connections for use in subsequent connect_accept calls */
                orte_routed_base_update_hnps(ndat);

                if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
                                                      ORTE_RML_TAG_RML_INFO_UPDATE,
                                                      orte_rml_send_callback, NULL))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(xfer);
                    return rc;
                }

                /* wait right here until the HNP acks the update to ensure that
                 * any subsequent messaging can succeed
                 */
                ack_waiting = true;
                orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                        ORTE_RML_TAG_UPDATE_ROUTE_ACK,
                                        ORTE_RML_NON_PERSISTENT,
                                        recv_ack, &ack_waiting);
                ORTE_WAIT_FOR_COMPLETION(ack_waiting);

                OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                                     "%s routed_radix_init_routes: ack recvd",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                /* our get_route function automatically routes all messages for
                 * other job families via the HNP, so nothing more to do here
                 */
            }
            return ORTE_SUCCESS;
        }

        /* if ndat=NULL, then we are being called during orte_init. In this
         * case, we need to setup a few critical pieces of info
         */
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_radix: init routes for proc job %s\n\thnp_uri %s\n\tdaemon uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri,
                             (NULL == orte_process_info.my_daemon_uri) ? "NULL" : orte_process_info.my_daemon_uri));

        if (NULL == orte_process_info.my_daemon_uri) {
            /* in this module, we absolutely MUST have this information - if
             * we didn't get it, then error out
             */
            opal_output(0, "%s ERROR: Failed to identify the local daemon's URI",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: This is a fatal condition when the radix router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: has been selected - either select the unity router",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            opal_output(0, "%s ERROR: or ensure that the local daemon info is provided",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERR_FATAL;
        }

        /* we have to set the HNP's name, even though we won't route messages directly
         * to it. This is required to ensure that we -do- send messages to the correct
         * HNP name
         */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                           ORTE_PROC_MY_HNP, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* Set the contact info in the RML - this won't actually establish
         * the connection, but just tells the RML how to reach the daemon
         * if/when we attempt to send to it
         */
        orte_rml.set_contact_info(orte_process_info.my_daemon_uri);

        /* extract the daemon's name so we can update the routing table */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
                                                           ORTE_PROC_MY_DAEMON, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* set our lifeline to the local daemon - we will abort if this connection is lost */
        lifeline = ORTE_PROC_MY_DAEMON;

        /* register ourselves -this sends a message to the daemon (warming up that connection)
         * and sends our contact info to the HNP when all local procs have reported
         *
         * NOTE: it may seem odd that we send our contact info to the HNP - after all,
         * the HNP doesn't really need to know how to talk to us directly if we are
         * using this routing method. However, this is good for two reasons:
         *
         * (1) some debuggers and/or tools may need RML contact
         * info to set themselves up
         *
         * (2) doing so allows the HNP to "block" in a dynamic launch
         * until all procs are reported running, thus ensuring that no communication
         * is attempted until the overall ORTE system knows how to talk to everyone -
         * otherwise, the system can just hang.
         */
        if (ORTE_SUCCESS != (rc = orte_routed_base_register_sync(true))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* no answer is expected or coming */

        return ORTE_SUCCESS;
    }
}
int main(int argc, char *argv[]){ int count; int msgsize; uint8_t *msg; int i, j, rc; orte_process_name_t peer; double maxpower; opal_buffer_t *buf; orte_rml_recv_cb_t blob; /* * Init */ orte_init(&argc, &argv, ORTE_PROC_NON_MPI); if (argc > 1) { count = atoi(argv[1]); if (count < 0) { count = INT_MAX-1; } } else { count = MAX_COUNT; } peer.jobid = ORTE_PROC_MY_NAME->jobid; peer.vpid = ORTE_PROC_MY_NAME->vpid + 1; if (peer.vpid == orte_process_info.num_procs) { peer.vpid = 0; } for (j=1; j < count+1; j++) { /* rank0 starts ring */ if (ORTE_PROC_MY_NAME->vpid == 0) { /* setup the initiating buffer - put random sized message in it */ buf = OBJ_NEW(opal_buffer_t); maxpower = (double)(j%7); msgsize = (int)pow(10.0, maxpower); opal_output(0, "Ring %d message size %d bytes", j, msgsize); msg = (uint8_t*)malloc(msgsize); opal_dss.pack(buf, msg, msgsize, OPAL_BYTE); free(msg); orte_rml.send_buffer_nb(&peer, buf, MY_TAG, orte_rml_send_callback, NULL); /* wait for it to come around */ OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); blob.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &blob); ORTE_WAIT_FOR_COMPLETION(blob.active); OBJ_DESTRUCT(&blob); opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); } else { /* wait for msg */ OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); blob.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &blob); ORTE_WAIT_FOR_COMPLETION(blob.active); opal_output(0, "%s received message %d from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, ORTE_NAME_PRINT(&blob.name)); /* send it along */ buf = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(buf, &blob.data); OBJ_DESTRUCT(&blob); msg_active = true; orte_rml.send_buffer_nb(&peer, buf, MY_TAG, send_callback, NULL); ORTE_WAIT_FOR_COMPLETION(msg_active); } } orte_finalize(); return 0; }
/*
 * Pull the snapshot handle metadata for this app from our local daemon.
 *
 * Sends an ORTE_SSTORE_CENTRAL_PULL command, then blocks on the response
 * and unpacks: command echo, handle id, seq_num, global_ref_name,
 * local_location, metadata_filename into handle_info.
 *
 * @param handle_info  in/out: ->id identifies the handle to pull; the
 *                     remaining fields are filled from the daemon's reply
 * @return ORTE_SUCCESS or the first error encountered
 *
 * Fix vs. original: the cleanup branch for rb mistakenly re-cleared
 * `buffer` instead of `rb` after releasing rb (copy-paste bug).
 */
static int pull_handle_info(orte_sstore_central_app_snapshot_info_t *handle_info )
{
    int ret, exit_status = ORTE_SUCCESS;
    opal_buffer_t *buffer = NULL;
    orte_sstore_central_cmd_flag_t command;
    orte_std_cntr_t count;
    orte_sstore_base_handle_t loc_id;
    orte_rml_recv_cb_t* rb = NULL;

    buffer = OBJ_NEW(opal_buffer_t);

    /*
     * Ask the daemon to send us the info that we need
     */
    command = ORTE_SSTORE_CENTRAL_PULL;
    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &command, 1, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &(handle_info->id), 1, ORTE_SSTORE_HANDLE))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer,
                                                       ORTE_RML_TAG_SSTORE_INTERNAL,
                                                       orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }
    /* buffer should not be released here; the callback releases it */
    buffer = NULL;

    /*
     * Receive the response
     */
    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(app): pull() from %s -> %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON)));

    rb = OBJ_NEW(orte_rml_recv_cb_t);
    rb->active = true;
    orte_rml.recv_buffer_nb(ORTE_PROC_MY_DAEMON, ORTE_RML_TAG_SSTORE_INTERNAL,
                            0, orte_rml_recv_callback, rb);
    ORTE_WAIT_FOR_COMPLETION(rb->active);

    /* the daemon echoes back the command, then the handle id */
    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &command, &count, ORTE_SSTORE_CENTRAL_CMD))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &loc_id, &count, ORTE_SSTORE_HANDLE))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }
    if( loc_id != handle_info->id ) {
        ; /* JJH Big problem */
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->seq_num), &count, OPAL_INT))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->global_ref_name), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->local_location), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    count = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(&rb->data, &(handle_info->metadata_filename), &count, OPAL_STRING))) {
        ORTE_ERROR_LOG(ret);
        exit_status = ret;
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((10, mca_sstore_central_component.super.output_handle,
                         "sstore:central:(app): pull() from %s -> %s (%d, %d, %s)",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON),
                         handle_info->id,
                         handle_info->seq_num,
                         handle_info->global_ref_name ));

 cleanup:
    if (NULL != buffer) {
        OBJ_RELEASE(buffer);
        buffer = NULL;
    }
    if (NULL != rb) {
        OBJ_RELEASE(rb);
        rb = NULL;  /* was `buffer = NULL;` - copy-paste bug */
    }

    return exit_status;
}
int main(int argc, char *argv[]) { orcm_alloc_t alloc, *aptr; orte_rml_recv_cb_t xfer; opal_buffer_t *buf; int rc, n; orcm_scd_cmd_flag_t command=ORCM_SESSION_REQ_COMMAND; orcm_alloc_id_t id; struct timeval tv; /* initialize, parse command line, and setup frameworks */ orcm_osub_init(argc, argv); /* create an allocation request */ OBJ_CONSTRUCT(&alloc, orcm_alloc_t); alloc.priority = 1; // session priority alloc.account = orcm_osub_globals.account; // account to be charged alloc.name = orcm_osub_globals.name; // user-assigned project name alloc.gid = orcm_osub_globals.gid; // group id to be run under alloc.max_nodes = orcm_osub_globals.max_nodes; // max number of nodes alloc.max_pes = orcm_osub_globals.max_pes; // max number of processing elements alloc.min_nodes = orcm_osub_globals.min_nodes; // min number of nodes required alloc.min_pes = orcm_osub_globals.min_pes; // min number of pe's required alloc.exclusive = orcm_osub_globals.exclusive; // true if nodes to be exclusively allocated (i.e., not shared across sessions) alloc.interactive = orcm_osub_globals.interactive; // true if in interactive mode alloc.nodes = '\0'; // regex of nodes to be used alloc.parent_name = ORTE_NAME_PRINT(ORTE_PROC_MY_NAME); // my_daemon_name alloc.parent_uri = '\0'; // my_daemon uri address /* alloc.constraints = orcm_osub_globals.resources */ ; // list of resource constraints to be applied when selecting hosts alloc.hnpname = '\0'; //my hnp name alloc.hnpuri = '\0'; //my hnp uri alloc.caller_uid = getuid(); // caller uid, not from args alloc.caller_gid = getgid(); // caller gid, not from args if (NULL == orcm_osub_globals.starttime || 0 == strlen(orcm_osub_globals.starttime)) { gettimeofday(&tv,NULL); /* desired start time for allocation deafults to now */ alloc.begin = tv.tv_sec; } else { /* TODO: eventually parse the string to figure out what user means, for now its now */ gettimeofday(&tv,NULL); alloc.begin = tv.tv_sec; } if (NULL == orcm_osub_globals.walltime || 0 == 
strlen(orcm_osub_globals.walltime)) { /* desired walltime default to 10 min */ alloc.walltime = 600; } else { /* get this in seconds for now, but will be parsed for more complexity later */ alloc.walltime = (time_t)strtol(orcm_osub_globals.walltime, NULL, 10); // max execution time } /* setup to receive the result */ OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t); xfer.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORCM_RML_TAG_SCD, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &xfer); /* send it to the scheduler */ buf = OBJ_NEW(opal_buffer_t); /* pack the alloc command flag */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &command,1, ORCM_SCD_CMD_T))) { ORTE_ERROR_LOG(rc); return rc; } aptr = &alloc; if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &aptr, 1, ORCM_ALLOC))) { ORTE_ERROR_LOG(rc); return rc; } if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_SCHEDULER, buf, ORCM_RML_TAG_SCD, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; } /* get our allocated jobid */ n=1; ORTE_WAIT_FOR_COMPLETION(xfer.active); if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &id, &n, ORCM_ALLOC_ID_T))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } opal_output(0, "RECEIVED ALLOC ID %d", (int)id); if (ORTE_SUCCESS != orcm_finalize()) { fprintf(stderr, "Failed orcm_finalize\n"); exit(1); } return ORTE_SUCCESS; }
/*
 * Fault-tolerance event handler for the env ESS component.
 *
 * @param state  one of OPAL_CRS_CHECKPOINT / CONTINUE / RESTART / TERM
 * @return ORTE_SUCCESS or the first error encountered
 *
 * Notifies the SnapC, Routed and RML/OOB frameworks of checkpoint
 * lifecycle transitions.  NOTE(review): the notification order is
 * deliberately reversed between checkpoint (snapc -> routed -> rml) and
 * continue (rml -> routed -> snapc); do not reorder.
 */
static int rte_ft_event(int state)
{
    int ret, exit_status = ORTE_SUCCESS;
    orte_proc_type_t svtype;
    orte_grpcomm_collective_t coll;

    OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
    coll.id = orte_process_info.peer_init_barrier;

    /******** Checkpoint Prep ********/
    if(OPAL_CRS_CHECKPOINT == state) {
        /*
         * Notify SnapC
         */
        if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CHECKPOINT))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Notify Routed
         */
        if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CHECKPOINT))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Notify RML -> OOB
         */
        if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CHECKPOINT))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }
    /******** Continue Recovery ********/
    else if (OPAL_CRS_CONTINUE == state ) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                             "ess:env ft_event(%2d) - %s is Continuing",
                             state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /*
         * Notify RML -> OOB
         */
        if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_CONTINUE))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Notify Routed
         */
        if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_CONTINUE))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Notify SnapC
         */
        if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_CONTINUE))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        if( orte_cr_continue_like_restart ) {
            /*
             * Barrier to make all processes have been successfully restarted before
             * we try to remove some restart only files.
             */
            if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
                opal_output(0, "ess:env: ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
                            state, ret);
                exit_status = ret;
                goto cleanup;
            }
            ORTE_WAIT_FOR_COMPLETION(coll.active);

            if( orte_cr_flush_restart_files ) {
                OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                     "ess:env ft_event(%2d): %s "
                                     "Cleanup restart files...",
                                     state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                opal_crs_base_cleanup_flush();
            }
        }
    }
    /******** Restart Recovery ********/
    else if (OPAL_CRS_RESTART == state ) {
        OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                             "ess:env ft_event(%2d) - %s is Restarting",
                             state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        /*
         * This should follow the ess init() function
         */

        /*
         * Clear nidmap and jmap
         */
        orte_util_nidmap_finalize();

        /*
         * - Reset Contact information
         */
        /* NOTE(review): failure here records exit_status but does NOT
         * goto cleanup - recovery continues. Confirm this is intended. */
        if( ORTE_SUCCESS != (ret = env_set_name() ) ) {
            exit_status = ret;
        }

        /*
         * Notify RML -> OOB
         */
        if( ORTE_SUCCESS != (ret = orte_rml.ft_event(OPAL_CRS_RESTART))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Restart the routed framework
         * JJH: Lie to the finalize function so it does not try to contact the daemon.
         */
        svtype = orte_process_info.proc_type;
        orte_process_info.proc_type = ORTE_PROC_TOOL;  /* temporary masquerade */
        if (ORTE_SUCCESS != (ret = orte_routed.finalize()) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
        orte_process_info.proc_type = svtype;          /* restore real type */
        if (ORTE_SUCCESS != (ret = orte_routed.initialize()) ) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Group Comm - Clean out stale data
         */
        orte_grpcomm.finalize();
        if (ORTE_SUCCESS != (ret = orte_grpcomm.init())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
        if (ORTE_SUCCESS != (ret = orte_db.remove(NULL, NULL))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Restart the PLM - Does nothing at the moment, but included for completeness
         */
        if (ORTE_SUCCESS != (ret = orte_plm.finalize())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
        if (ORTE_SUCCESS != (ret = orte_plm.init())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * RML - Enable communications
         */
        if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Notify Routed
         */
        if( ORTE_SUCCESS != (ret = orte_routed.ft_event(OPAL_CRS_RESTART))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /* if one was provided, build my nidmap */
        if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }

        /*
         * Barrier to make all processes have been successfully restarted before
         * we try to remove some restart only files.
         */
        if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
            opal_output(0, "ess:env ft_event(%2d): Failed in orte_grpcomm.barrier (%d)",
                        state, ret);
            exit_status = ret;
            goto cleanup;
        }
        ORTE_WAIT_FOR_COMPLETION(coll.active);

        if( orte_cr_flush_restart_files ) {
            OPAL_OUTPUT_VERBOSE((1, orte_ess_base_output,
                                 "ess:env ft_event(%2d): %s "
                                 "Cleanup restart files...",
                                 state, ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            opal_crs_base_cleanup_flush();
        }

        /*
         * Session directory re-init
         */
        if (orte_create_session_dirs) {
            /* NOTE(review): failure records exit_status but continues */
            if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                        orte_process_info.tmpdir_base,
                                                        orte_process_info.nodename,
                                                        NULL, /* Batch ID -- Not used */
                                                        ORTE_PROC_MY_NAME))) {
                exit_status = ret;
            }

            opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                             "output-", NULL, NULL);
        }

        /*
         * Notify SnapC
         */
        if( ORTE_SUCCESS != (ret = orte_snapc.ft_event(OPAL_CRS_RESTART))) {
            ORTE_ERROR_LOG(ret);
            exit_status = ret;
            goto cleanup;
        }
    }
    else if (OPAL_CRS_TERM == state ) {
        /* Nothing */
    }
    else {
        /* Error state = Nothing */
    }

 cleanup:
    OBJ_DESTRUCT(&coll);
    return exit_status;
}
int main(int argc, char *argv[]){ int count; int msgsize; uint8_t *msg; int i, j, rc; orte_process_name_t peer; double maxpower; opal_buffer_t *buf; orte_rml_recv_cb_t blob; int conduit_id = 0; //use the first available conduit struct timeval start, end; opal_list_t *conduit_attr; /* * Init */ orte_init(&argc, &argv, ORTE_PROC_NON_MPI); conduit_attr = OBJ_NEW(opal_list_t); if( ORTE_SUCCESS == ( orte_set_attribute( conduit_attr, ORTE_RML_PROVIDER_ATTRIB, ORTE_ATTR_GLOBAL,"sockets",OPAL_STRING))) { if( ORTE_SUCCESS == ( orte_set_attribute( conduit_attr, ORTE_RML_INCLUDE_COMP_ATTRIB, ORTE_ATTR_GLOBAL,"ofi",OPAL_STRING))) { opal_output(0, "%s calling open_conduit with ORTE_RML_INCLUDE_COMP_ATTRIB and ORTE_RML_OFI_PROV_NAME_ATTRIB", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); conduit_id = orte_rml_API_open_conduit(conduit_attr); if (0 > conduit_id ) { opal_output(0, "Conduit could not be opened for OFI, exiting"); return; } } } opal_output(0, "Using conduit-id %d ", conduit_id); if (argc > 1) { count = atoi(argv[1]); if (count < 0) { count = INT_MAX-1; } } else { count = MAX_COUNT; } peer.jobid = ORTE_PROC_MY_NAME->jobid; peer.vpid = ORTE_PROC_MY_NAME->vpid + 1; if (peer.vpid == orte_process_info.num_procs) { peer.vpid = 0; } gettimeofday(&start, NULL); for (j=1; j < count+1; j++) { /* rank0 starts ring */ if (ORTE_PROC_MY_NAME->vpid == 0) { /* setup the initiating buffer - put random sized message in it */ buf = OBJ_NEW(opal_buffer_t); maxpower = (double)(j%7); msgsize = (int)pow(10.0, maxpower); opal_output(0, "Ring %d message size %d bytes", j, msgsize); msg = (uint8_t*)malloc(msgsize); opal_dss.pack(buf, msg, msgsize, OPAL_BYTE); free(msg); orte_rml.send_buffer_nb(conduit_id,&peer, buf, MY_TAG, orte_rml_send_callback, NULL); /* wait for it to come around */ OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); blob.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &blob); ORTE_WAIT_FOR_COMPLETION(blob.active); 
OBJ_DESTRUCT(&blob); opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j); } else { /* wait for msg */ OBJ_CONSTRUCT(&blob, orte_rml_recv_cb_t); blob.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &blob); ORTE_WAIT_FOR_COMPLETION(blob.active); opal_output(0, "%s received message %d from %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j, ORTE_NAME_PRINT(&blob.name)); /* send it along */ buf = OBJ_NEW(opal_buffer_t); opal_dss.copy_payload(buf, &blob.data); OBJ_DESTRUCT(&blob); msg_active = true; orte_rml.send_buffer_nb(conduit_id,&peer, buf, MY_TAG, send_callback, NULL); ORTE_WAIT_FOR_COMPLETION(msg_active); } } gettimeofday(&end, NULL); orte_finalize(); printf("start: %d secs, %d usecs\n",start.tv_sec,start.tv_usec); printf("end: %d secs, %d usecs\n",end.tv_sec,end.tv_usec); printf("Total minutes = %d, Total seconds = %d", (end.tv_sec - start.tv_sec)/60, (end.tv_sec - start.tv_sec) ); return 0; }
/*
 * Proxy a job-spawn request to the HNP and wait for the result.
 *
 * @param jdata  job description to launch; on success jdata->jobid is
 *               filled in from the HNP's response
 * @return ORTE_SUCCESS or an ORTE error code
 *
 * For a singleton without a supporting HNP, forks one first and
 * re-initializes routing so messages flow through it.
 */
int orte_plm_proxy_spawn(orte_job_t *jdata)
{
    opal_buffer_t *buf;
    orte_plm_cmd_flag_t command;
    int rc;
    orte_proxy_spawn_t *ps;

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:proxy spawn child job",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* if we are a singleton and the supporting HNP hasn't
     * been spawned, then do so now
     */
    if ((orte_process_info.proc_type & ORTE_PROC_SINGLETON) &&
        !orte_routing_is_enabled) {
        OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                             "%s plm:base:proxy spawn HNP for support",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        if (ORTE_SUCCESS != orte_plm_base_fork_hnp()) {
            ORTE_ERROR_LOG(ORTE_ERR_FATAL);
            return ORTE_ERR_FATAL;
        }
        orte_routing_is_enabled = true;
        /* need to init_routes again to redirect messages
         * thru the HNP
         */
        orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL);
    }

    /* setup the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* tell the recipient we are sending a launch request */
    command = ORTE_PLM_LAUNCH_JOB_CMD;
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORTE_PLM_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the jdata object */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(buf, &jdata, 1, ORTE_JOB))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* create the proxy spawn object */
    ps = OBJ_NEW(orte_proxy_spawn_t);
    /* post the recv the HNP's response
     * (posted BEFORE the send so the answer cannot be missed) */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_PLM_PROXY,
                            ORTE_RML_NON_PERSISTENT, proxy_spawn_response, ps);

    /* tell the HNP to launch the job */
    if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, buf,
                                          ORTE_RML_TAG_PLM,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        /* NOTE(review): ps is released here while the recv posted above is
         * still outstanding - if a late response arrives, the callback gets
         * a dangling ps. Confirm whether the recv should be cancelled first. */
        OBJ_RELEASE(ps);
        goto CLEANUP;
    }

    OPAL_OUTPUT_VERBOSE((5, orte_plm_base_framework.framework_output,
                         "%s plm:base:proxy waiting for response",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* block in the event loop until proxy_spawn_response clears ps->active */
    ps->active = true;
    ORTE_WAIT_FOR_COMPLETION(ps->active);

    /* return the values */
    jdata->jobid = ps->jobid;
    rc = ps->rc;

    /* cleanup the memory */
    OBJ_RELEASE(ps);

 CLEANUP:
    return rc;
}
int orte_routed_base_register_sync(bool setup) { opal_buffer_t *buffer; int rc; orte_daemon_cmd_flag_t command; char *rml_uri; uint8_t flag; bool sync_waiting; if (orte_abnormal_term_ordered) { /* if we are abnormally terminating, don't * even try to deregister from the daemon - there * is no guarantee we won't just hang in * the communication */ return ORTE_SUCCESS; } OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output, "%s %s with daemon %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(ORTE_PROC_MY_DAEMON), setup ? "registering" : "deregistering")); /* we need to get the oob to establish * the connection - the oob will leave the connection "alive" * thereafter so we can communicate readily */ buffer = OBJ_NEW(opal_buffer_t); if (setup) { /* tell the daemon to send back a nidmap */ command = ORTE_DAEMON_SYNC_WANT_NIDMAP; if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); return rc; } /* add our contact info to the buffer so the daemon can explicitly * store it */ rml_uri = orte_rml.get_contact_info(); if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &rml_uri, 1, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); free(rml_uri); return rc; } if (NULL != rml_uri) free(rml_uri); /* tell the daemon if we are an MPI proc */ if (ORTE_PROC_IS_MPI) { flag = 1; } else { flag = 0; } if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &flag, 1, OPAL_UINT8))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); return rc; } } else { /* deregister with the daemon */ command = ORTE_DAEMON_SYNC_BY_PROC; if (ORTE_SUCCESS != (rc = opal_dss.pack(buffer, &command, 1, ORTE_DAEMON_CMD))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buffer); return rc; } } /* setup to receive the response */ sync_waiting = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_SYNC, ORTE_RML_NON_PERSISTENT, report_sync, &sync_waiting); /* send the sync command to our daemon */ if (0 > (rc = 
orte_rml.send_buffer_nb(ORTE_PROC_MY_DAEMON, buffer, ORTE_RML_TAG_DAEMON, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); return rc; } OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output, "%s registering sync waiting for ack", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); /* get the ack - includes the nidmap */ ORTE_WAIT_FOR_COMPLETION(sync_waiting); OPAL_OUTPUT_VERBOSE((5, orte_routed_base_framework.framework_output, "%s registering sync ack recvd", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME))); return ORTE_SUCCESS; }
static int rte_init(void) { int ret; char *error = NULL; char **hosts = NULL; /* run the prolog */ if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) { error = "orte_ess_base_std_prolog"; goto error; } /* Start by getting a unique name from the enviro */ env_set_name(); /* if I am a daemon, complete my setup using the * default procedure */ if (ORTE_PROC_IS_DAEMON) { if (NULL != orte_node_regex) { /* extract the nodes */ if (ORTE_SUCCESS != (ret = orte_regex_extract_node_names(orte_node_regex, &hosts))) { error = "orte_regex_extract_node_names"; goto error; } } if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup(hosts))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_orted_setup"; goto error; } opal_argv_free(hosts); return ORTE_SUCCESS; } if (ORTE_PROC_IS_TOOL) { /* otherwise, if I am a tool proc, use that procedure */ if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_tool_setup"; goto error; } /* as a tool, I don't need a nidmap - so just return now */ return ORTE_SUCCESS; } /* use the default procedure to finish my setup */ if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup(true))) { ORTE_ERROR_LOG(ret); error = "orte_ess_base_app_setup"; goto error; } /* if data was provided, update the database */ if (ORTE_SUCCESS != (ret = orte_util_nidmap_init(orte_process_info.sync_buf))) { ORTE_ERROR_LOG(ret); error = "orte_util_nidmap_init"; goto error; } /* setup process binding */ if (ORTE_SUCCESS != (ret = orte_ess_base_proc_binding())) { error = "proc_binding"; goto error; } /* if we are an ORTE app - and not an MPI app - then * we need to exchange our connection info here. * MPI_Init has its own modex, so we don't need to do * two of them. However, if we don't do a modex at all, * then processes have no way to communicate * * NOTE: only do this when the process originally launches. 
* Cannot do this on a restart as the rest of the processes * in the job won't be executing this step, so we would hang */ if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) { orte_grpcomm_collective_t coll; OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t); coll.id = orte_process_info.peer_modex; coll.active = true; if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(&coll))) { ORTE_ERROR_LOG(ret); error = "orte modex"; goto error; } ORTE_WAIT_FOR_COMPLETION(coll.active); OBJ_DESTRUCT(&coll); } return ORTE_SUCCESS; error: if (ORTE_ERR_SILENT != ret && !orte_report_silent_errors) { orte_show_help("help-orte-runtime.txt", "orte_init:startup:internal-failure", true, error, ORTE_ERROR_NAME(ret), ret); } return ret; }
/**
 * Standard setup procedure for an ORTE application process.
 *
 * Opens and selects the core ORTE frameworks in strict dependency order
 * (state machine, errmgr, RML, routed, db, grpcomm), initializes the PLM
 * proxy, enables RML communication, creates the session directory, and
 * initializes routing and CR support. Ends with an init barrier for
 * non-MPI apps so no peer can finalize before all have initialized.
 *
 * NOTE(review): the open/select sequence below is order-dependent - the
 * errmgr select, for example, deliberately happens after the RML is up.
 * Do not reorder.
 *
 * @return ORTE_SUCCESS, or an error code after emitting orte_show_help
 *         diagnostics naming the failed step
 */
int orte_ess_base_app_setup(void)
{
    int ret;
    char *error = NULL;

    /*
     * stdout/stderr buffering
     * If the user requested to override the default setting then do
     * as they wish: 0 = unbuffered, 1 = line buffered, 2 = fully buffered.
     * -1 (the default) leaves libc's own choice in place.
     */
    if( orte_ess_base_std_buffering > -1 ) {
        if( 0 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IONBF, 0);
            setvbuf(stderr, NULL, _IONBF, 0);
        }
        else if( 1 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOLBF, 0);
            setvbuf(stderr, NULL, _IOLBF, 0);
        }
        else if( 2 == orte_ess_base_std_buffering ) {
            setvbuf(stdout, NULL, _IOFBF, 0);
            setvbuf(stderr, NULL, _IOFBF, 0);
        }
    }

    /* open and setup the state machine */
    if (ORTE_SUCCESS != (ret = orte_state_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_state_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_state_base_select";
        goto error;
    }
    /* open the errmgr */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_open";
        goto error;
    }
    /* Setup the communication infrastructure */
    /* Runtime Messaging Layer */
    if (ORTE_SUCCESS != (ret = orte_rml_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml_base_select";
        goto error;
    }
    /* setup the errmgr - selected only after the RML exists */
    if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_errmgr_base_select";
        goto error;
    }
    /* Routed system */
    if (ORTE_SUCCESS != (ret = orte_routed_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed_base_select";
        goto error;
    }
    /* database */
    if (ORTE_SUCCESS != (ret = orte_db_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_db_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_db_base_select";
        goto error;
    }
    /*
     * Group communications
     */
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_grpcomm_base_select";
        goto error;
    }
    /* non-daemon/HNP apps can only have the default proxy PLM
     * module open - provide a chance for it to initialize */
    if (ORTE_SUCCESS != (ret = orte_plm.init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_plm_init";
        goto error;
    }
    /* enable communication via the rml */
    if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_rml.enable_comm";
        goto error;
    }
    /* setup my session directory */
    if (orte_create_session_dirs) {
        OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                             "%s setting up session dir with\n\ttmpdir: %s\n\thost %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == orte_process_info.tmpdir_base) ? "UNDEF" : orte_process_info.tmpdir_base,
                             orte_process_info.nodename));
        if (ORTE_SUCCESS != (ret = orte_session_dir(true,
                                                    orte_process_info.tmpdir_base,
                                                    orte_process_info.nodename, NULL,
                                                    ORTE_PROC_MY_NAME))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_session_dir";
            goto error;
        }
        /* Once the session directory location has been established, set
           the opal_output env file location to be in the
           proc-specific session directory. */
        opal_output_set_output_file_info(orte_process_info.proc_session_dir,
                                         "output-", NULL, NULL);
    }
    /* setup the routed info - the selected routed component
     * will know what to do. Some may put us in a blocking
     * receive here so they can get ALL of the contact info
     * from our peers. Others may just find the local daemon's
     * contact info and immediately return.
     */
    if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_routed.init_routes";
        goto error;
    }
#if OPAL_ENABLE_FT_CR == 1
    /*
     * Setup the SnapC (snapshot coordination) framework - FT builds only
     */
    if (ORTE_SUCCESS != (ret = orte_snapc_base_open())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_open";
        goto error;
    }
    if (ORTE_SUCCESS != (ret = orte_snapc_base_select(ORTE_PROC_IS_HNP, !ORTE_PROC_IS_DAEMON))) {
        ORTE_ERROR_LOG(ret);
        error = "orte_snapc_base_select";
        goto error;
    }
    /* apps need the OPAL CR stuff */
    opal_cr_set_enabled(true);
#else
    opal_cr_set_enabled(false);
#endif
    /* Initalize the CR setup
     * Note: Always do this, even in non-FT builds.
     * If we don't some user level tools may hang.
     */
    if (ORTE_SUCCESS != (ret = orte_cr_init())) {
        ORTE_ERROR_LOG(ret);
        error = "orte_cr_init";
        goto error;
    }
    /* if we are an ORTE app - and not an MPI app - then
     * we need to barrier here. MPI_Init has its own barrier,
     * so we don't need to do two of them. However, if we
     * don't do a barrier at all, then one process could
     * finalize before another one called orte_init. This
     * causes ORTE to believe that the proc abnormally
     * terminated
     *
     * NOTE: only do this when the process originally launches.
     * Cannot do this on a restart as the rest of the processes
     * in the job won't be executing this step, so we would hang
     */
    if (ORTE_PROC_IS_NON_MPI && !orte_do_not_barrier) {
        orte_grpcomm_collective_t coll;
        OBJ_CONSTRUCT(&coll, orte_grpcomm_collective_t);
        coll.id = orte_process_info.peer_init_barrier;
        /* NOTE(review): unlike the modex call elsewhere in this file,
         * coll.active is not set to true before waiting - presumably the
         * constructor or orte_grpcomm.barrier sets it; verify */
        if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(&coll))) {
            ORTE_ERROR_LOG(ret);
            error = "orte barrier";
            goto error;
        }
        ORTE_WAIT_FOR_COMPLETION(coll.active);
        OBJ_DESTRUCT(&coll);
    }

    return ORTE_SUCCESS;

 error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    return ret;
}
/**
 * Fault-tolerance event handler for the ob1 PML.
 *
 * Called with a checkpoint/restart state (OPAL_CRS_CHECKPOINT, CONTINUE,
 * RESTART_PRE, RESTART, TERM). Performs timing barriers around the event,
 * forwards the event to the BML (and through it the BTLs/mpools), and on
 * restart (or "continue-like-restart") refreshes the proc structures,
 * re-runs the modex, and re-adds the procs to the PML.
 *
 * NOTE(review): `first_continue_pass` is static - CONTINUE events arrive
 * in pairs and the real work happens only on the second pass of a pair.
 * This makes the function stateful across calls and NOT reentrant.
 *
 * @param state  one of the OPAL_CRS_* event codes
 * @return OMPI_SUCCESS, or the first error encountered
 */
int mca_pml_ob1_ft_event( int state )
{
    static bool first_continue_pass = false;
    ompi_proc_t** procs = NULL;
    size_t num_procs;
    int ret, p;
    orte_grpcomm_collective_t *coll, *modex;

    /* collective object reused for every barrier in this function;
     * released at the `clean` label */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_init_barrier;
    if(OPAL_CRS_CHECKPOINT == state) {
        /* pre-checkpoint: optional timing barrier, then stamp the timer */
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCPBR1);
            orte_grpcomm.barrier(coll);
            ORTE_WAIT_FOR_COMPLETION(coll->active);
        }
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P0);
    }
    else if(OPAL_CRS_CONTINUE == state) {
        /* toggle: work below only fires on the second of each pair of
         * CONTINUE events */
        first_continue_pass = !first_continue_pass;

        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_COREBR0);
                orte_grpcomm.barrier(coll);
                ORTE_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P2);
        }

        if( orte_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Get a list of processes
             */
            procs = ompi_proc_all(&num_procs);
            if(NULL == procs) {
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto clean;
            }

            /*
             * Refresh the proc structure, and publish our proc info in the modex.
             * NOTE: Do *not* call ompi_proc_finalize as there are many places in
             *       the code that point to indv. procs in this strucutre. For our
             *       needs here we only need to fix up the modex, bml and pml
             *       references.
             */
            if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
                opal_output(0, "pml:ob1: ft_event(Restart): proc_refresh Failed %d", ret);
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free (procs);
                goto clean;
            }
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Get a list of processes
         */
        procs = ompi_proc_all(&num_procs);
        if(NULL == procs) {
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto clean;
        }

        /*
         * Clean out the modex information since it is invalid now.
         *    orte_grpcomm.purge_proc_attrs();
         * This happens at the ORTE level, so doing it again here will cause
         * some issues with socket caching.
         */

        /*
         * Refresh the proc structure, and publish our proc info in the modex.
         * NOTE: Do *not* call ompi_proc_finalize as there are many places in
         *       the code that point to indv. procs in this strucutre. For our
         *       needs here we only need to fix up the modex, bml and pml
         *       references.
         */
        if (OMPI_SUCCESS != (ret = ompi_proc_refresh())) {
            opal_output(0, "pml:ob1: ft_event(Restart): proc_refresh Failed %d", ret);
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free (procs);
            goto clean;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    /* Call the BML
     * BML is expected to call ft_event in
     * - BTL(s)
     * - MPool(s)
     */
    if( OMPI_SUCCESS != (ret = mca_bml.bml_ft_event(state))) {
        /* NOTE: error is logged but deliberately not propagated - the
         * post-event phase below still runs */
        opal_output(0, "pml:base: ft_event: BML ft_event function failed: %d\n", ret);
    }

    /* post-event phase: timers, re-modex and re-add procs as needed */
    if(OPAL_CRS_CHECKPOINT == state) {
        OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P1);
        if( opal_cr_timing_barrier_enabled ) {
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR0);
            /* JJH Cannot barrier here due to progress engine -- orte_grpcomm.barrier();*/
        }
    }
    else if(OPAL_CRS_CONTINUE == state) {
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR1);
                orte_grpcomm.barrier(coll);
                ORTE_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2P3);
        }

        if( orte_cr_continue_like_restart && !first_continue_pass ) {
            /*
             * Exchange the modex information once again.
             * BTLs will have republished their modex information.
             */
            modex = OBJ_NEW(orte_grpcomm_collective_t);
            modex->id = orte_process_info.peer_modex;
            if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
                opal_output(0,
                            "pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                            ret);
                OBJ_RELEASE(modex);
                goto clean;
            }
            ORTE_WAIT_FOR_COMPLETION(modex->active);
            OBJ_RELEASE(modex);

            /*
             * Startup the PML stack now that the modex is running again
             * Add the new procs (BTLs redo modex recv's)
             * `procs` here was obtained in the pre-event phase above.
             */
            if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
                opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
                goto clean;
            }

            /* Is this barrier necessary ? JJH */
            if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
                opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
                goto clean;
            }
            ORTE_WAIT_FOR_COMPLETION(coll->active);

            if( NULL != procs ) {
                for(p = 0; p < (int)num_procs; ++p) {
                    OBJ_RELEASE(procs[p]);
                }
                free(procs);
                procs = NULL;
            }
        }
        if( !first_continue_pass ) {
            if( opal_cr_timing_barrier_enabled ) {
                OPAL_CR_SET_TIMER(OPAL_CR_TIMER_P2PBR2);
                orte_grpcomm.barrier(coll);
                ORTE_WAIT_FOR_COMPLETION(coll->active);
            }
            OPAL_CR_SET_TIMER(OPAL_CR_TIMER_CRCP1);
        }
    }
    else if(OPAL_CRS_RESTART_PRE == state ) {
        /* Nothing here */
    }
    else if(OPAL_CRS_RESTART == state ) {
        /*
         * Exchange the modex information once again.
         * BTLs will have republished their modex information.
         */
        modex = OBJ_NEW(orte_grpcomm_collective_t);
        modex->id = orte_process_info.peer_modex;
        if (OMPI_SUCCESS != (ret = orte_grpcomm.modex(modex))) {
            opal_output(0,
                        "pml:ob1: ft_event(Restart): Failed orte_grpcomm.modex() = %d",
                        ret);
            OBJ_RELEASE(modex);
            goto clean;
        }
        ORTE_WAIT_FOR_COMPLETION(modex->active);
        OBJ_RELEASE(modex);

        /*
         * Startup the PML stack now that the modex is running again
         * Add the new procs (BTLs redo modex recv's)
         */
        if( OMPI_SUCCESS != (ret = mca_pml_ob1_add_procs(procs, num_procs) ) ) {
            opal_output(0, "pml:ob1: ft_event(Restart): Failed in add_procs (%d)", ret);
            goto clean;
        }

        /* Is this barrier necessary ? JJH */
        if (OMPI_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
            opal_output(0, "pml:ob1: ft_event(Restart): Failed in orte_grpcomm.barrier (%d)", ret);
            goto clean;
        }
        ORTE_WAIT_FOR_COMPLETION(coll->active);

        if( NULL != procs ) {
            for(p = 0; p < (int)num_procs; ++p) {
                OBJ_RELEASE(procs[p]);
            }
            free(procs);
            procs = NULL;
        }
    }
    else if(OPAL_CRS_TERM == state ) {
        ;
    }
    else {
        ;
    }

    ret = OMPI_SUCCESS;

 clean:
    OBJ_RELEASE(coll);
    return ret;
}
/**
 * "direct" routed component: initialize routing for the given job.
 *
 * Behavior depends on this process's role:
 *  - tool:   nothing to do
 *  - daemon: parse/store the HNP contact info (first call, ndat == NULL),
 *            or apply an RML contact-info update (ndat != NULL)
 *  - HNP:    apply contact-info updates for its own job
 *  - proc:   store HNP/daemon names at init; for non-NULL ndat, set up a
 *            route to an out-of-family peer, relaying via the HNP when one
 *            exists and blocking until the HNP acks the update
 *
 * @param job   jobid the routes are being initialized for
 * @param ndat  NULL during orte_init, otherwise a buffer of RML contact info
 * @return ORTE_SUCCESS or an ORTE error code
 */
static int init_routes(orte_jobid_t job, opal_buffer_t *ndat)
{
    int rc;

    /* if I am a tool, then I stand alone - there is nothing to do */
    if (ORTE_PROC_IS_TOOL) {
        return ORTE_SUCCESS;
    }

    /* if I am a daemon or HNP, then I have to extract the routing info for this job
     * from the data sent to me for launch and update the routing tables to
     * point at the daemon for each proc
     */
    if (ORTE_PROC_IS_DAEMON) {

        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s direct: init routes for daemon job %s\n\thnp_uri %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job),
                             (NULL == orte_process_info.my_hnp_uri) ? "NULL" : orte_process_info.my_hnp_uri));

        if (NULL == ndat) {
            /* indicates this is being called during orte_init.
             * Get the HNP's name for possible later use
             */
            if (NULL == orte_process_info.my_hnp_uri) {
                /* fatal error */
                ORTE_ERROR_LOG(ORTE_ERR_FATAL);
                return ORTE_ERR_FATAL;
            }

            /* extract the hnp name and store it */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                               ORTE_PROC_MY_HNP, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* set the contact info into the hash table */
            orte_rml.set_contact_info(orte_process_info.my_hnp_uri);

            /* the HNP is my lifeline - lose that connection and we abort */
            lifeline = ORTE_PROC_MY_HNP;

            /* daemons will send their contact info back to the HNP as
             * part of the message confirming they are read to go. HNP's
             * load their contact info during orte_init
             */
        } else {
            /* ndat != NULL means we are getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                ORTE_ERROR_LOG(rc);
            }
            /* note: returns rc (success OR failure) - the verbose output
             * below is only reached on the ndat == NULL path */
            return rc;
        }

        OPAL_OUTPUT_VERBOSE((2, orte_routed_base_framework.framework_output,
                             "%s routed_direct: completed init routes",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        return ORTE_SUCCESS;
    }

    if (ORTE_PROC_IS_HNP) {

        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_direct: init routes for HNP job %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(job)));

        if (NULL != ndat) {
            /* if this is for my own jobid, then I am getting an update of RML info
             * for the daemons - so update our contact info and routes
             */
            if (ORTE_PROC_MY_NAME->jobid == job) {
                if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
                    ORTE_ERROR_LOG(rc);
                    return rc;
                }
            }
        }

        return ORTE_SUCCESS;
    }

    /***   MUST BE A PROC   ***/
    if (NULL == ndat) {
        /* if we were direct launched, there is nothing we need to do. If we
         * were launched by mpirun, then we need to set the HNP and daemon info
         */
        if (NULL != orte_process_info.my_hnp_uri) {
            /* extract the hnp name and store it */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_hnp_uri,
                                                               ORTE_PROC_MY_HNP, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            /* we don't set the HNP's contact info as we don't need it - we
             * only contact our local daemon, which might be the HNP (in which
             * case it will have also been passed as our daemon uri) */
        }

        if (NULL != orte_process_info.my_daemon_uri) {
            /* extract the daemon's name so we can update the routing table */
            if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(orte_process_info.my_daemon_uri,
                                                               ORTE_PROC_MY_DAEMON, NULL))) {
                ORTE_ERROR_LOG(rc);
                return rc;
            }
            orte_rml.set_contact_info(orte_process_info.my_daemon_uri);
            /* my daemon is my lifeline - lose that connection and we abort */
            lifeline = ORTE_PROC_MY_DAEMON;
        }
        return ORTE_SUCCESS;
    }

    /* if ndat != NULL, then this is being invoked by the proc to
     * init a route to a specified process that is outside of our
     * job family. We want that route to go through our HNP, routed via
     * out local daemon - however, we cannot know for
     * certain that the HNP already knows how to talk to the specified
     * procs. For example, in OMPI's publish/subscribe procedures, the
     * DPM framework looks for an mca param containing the global ompi-server's
     * uri. This info will come here so the proc can setup a route to
     * the server - we need to pass the routing info to our HNP.
     *
     * Obviously, if we were direct launched, we won't have an HNP, in
     * which case we just update our own contact info and go direct
     */
    if (NULL == orte_process_info.my_hnp_uri) {
        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_direct: init routes w/non-NULL data and direct launched",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(ndat))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    } else {
        opal_buffer_t *xfer;
        orte_rml_cmd_flag_t cmd=ORTE_RML_UPDATE_CMD;
        bool ack_waiting;

        OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                             "%s routed_direct: init routes w/non-NULL data",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

        if (ORTE_JOB_FAMILY(ORTE_PROC_MY_NAME->jobid) != ORTE_JOB_FAMILY(job)) {
            /* if this is for a different job family, then we route via our HNP
             * to minimize connection counts to entities such as ompi-server, so
             * start by sending the contact info to the HNP for update
             */
            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                                 "%s routed_direct_init_routes: diff job family - sending update to %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_HNP)));

            /* prep the buffer for transmission to the HNP */
            xfer = OBJ_NEW(opal_buffer_t);
            /* NOTE(review): pack/copy_payload return codes are not checked
             * here, unlike elsewhere in this function */
            opal_dss.pack(xfer, &cmd, 1, ORTE_RML_CMD);
            opal_dss.copy_payload(xfer, ndat);

            /* save any new connections for use in subsequent connect_accept calls */
            orte_routed_base_update_hnps(ndat);

            if (0 > (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_HNP, xfer,
                                                  ORTE_RML_TAG_RML_INFO_UPDATE,
                                                  orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(rc);
                OBJ_RELEASE(xfer);
                return rc;
            }

            /* wait right here until the HNP acks the update to ensure that
             * any subsequent messaging can succeed
             */
            ack_waiting = true;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                    ORTE_RML_TAG_UPDATE_ROUTE_ACK,
                                    ORTE_RML_NON_PERSISTENT,
                                    recv_ack, &ack_waiting);
            ORTE_WAIT_FOR_COMPLETION(ack_waiting);

            OPAL_OUTPUT_VERBOSE((1, orte_routed_base_framework.framework_output,
                                 "%s routed_direct_init_routes: ack recvd",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            /* our get_route function automatically routes all messages for
             * other job families via the HNP, so nothing more to do here
             */
        }
    }

    return ORTE_SUCCESS;
}
int orcm_octl_queue_status(char **argv) { orcm_alloc_t **allocs; orte_rml_recv_cb_t xfer; opal_buffer_t *buf; int rc, i, j, n, num_queues, num_sessions; orcm_scd_cmd_flag_t command; char *name; /* setup to receive the result */ OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t); xfer.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORCM_RML_TAG_SCD, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &xfer); buf = OBJ_NEW(opal_buffer_t); command = ORCM_SESSION_INFO_COMMAND; /* pack the session info command flag */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORCM_SCD_CMD_T))) { ORTE_ERROR_LOG(rc); return rc; } /* send it to the scheduler */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_SCHEDULER, buf, ORCM_RML_TAG_SCD, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; } /* unpack number of queues */ ORTE_WAIT_FOR_COMPLETION(xfer.active); n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &num_queues, &n, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } printf("********\nQUEUES\n********\n"); /* for each queue */ for (i = 0; i < num_queues; i++) { /* get the name */ n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &name, &n, OPAL_STRING))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } /* get the number of sessions on the queue */ n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &num_sessions, &n, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } printf("%s (%i sessions)\n----------\n", name, num_sessions); if (0 < num_sessions) { allocs = (orcm_alloc_t**)malloc(num_sessions * sizeof(orcm_alloc_t*)); if (NULL == allocs) { return ORCM_ERR_OUT_OF_RESOURCE; } /* get the sessions on the queue */ if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, allocs, &num_sessions, ORCM_ALLOC))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } /* loop through sessions, and print them */ for (j = 0; j < num_sessions; j++) 
{ printf("%d\t%u|%u\t%i\t%s\t%s\n", (int)allocs[j]->id, allocs[j]->caller_uid, allocs[j]->caller_gid, allocs[j]->min_nodes, allocs[j]->exclusive ? "EX" : "SH", allocs[j]->interactive ? "I" : "B" ); OBJ_DESTRUCT(allocs[j]); } free(allocs); } } OBJ_DESTRUCT(&xfer); return ORTE_SUCCESS; }
int main(int argc, char *argv[]) { orte_rml_recv_cb_t xfer; opal_buffer_t *buf; int rc, i, n, wfid; orcm_analytics_cmd_flag_t command; FILE *fp; opal_value_t *oflow_value; opal_value_t **oflow_array = NULL; orte_process_name_t wf_agg; /* initialize, parse command line, and setup frameworks */ orcm_oflow_init(argc, argv); if (NULL == (fp = fopen(orcm_oflow_globals.file, "r"))) { perror("Can't open workflow file"); if (ORTE_SUCCESS != orcm_finalize()) { fprintf(stderr, "Failed orcm_finalize\n"); exit(1); } return ORCM_ERR_BAD_PARAM; } i = 0; oflow_value = oflow_parse_next_line(fp); while(oflow_value) { if (0 == strncmp("VPID", oflow_value->key, ORCM_MAX_LINE_LENGTH)) { wf_agg.jobid = 0; wf_agg.vpid = (orte_vpid_t)strtol(oflow_value->data.string, (char **)NULL, 10); printf("Sending to %s\n", ORTE_NAME_PRINT(&wf_agg)); free(oflow_value); oflow_value = oflow_parse_next_line(fp); continue; } printf("KEY: %s \n\tVALUE: %s\n", oflow_value->key, oflow_value->data.string); oflow_array = (opal_value_t**)realloc(oflow_array, (sizeof(oflow_array) + sizeof(opal_value_t*))); if (!oflow_array) { fclose(fp); free(oflow_value); return ORCM_ERR_OUT_OF_RESOURCE; } oflow_array[i] = oflow_value; oflow_value = oflow_parse_next_line(fp); i++; } fclose(fp); /* setup to receive the result */ OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t); xfer.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORCM_RML_TAG_ANALYTICS, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &xfer); /* setup to recieve workflow output */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, 12345, ORTE_RML_PERSISTENT, orcm_oflow_recv, NULL); buf = OBJ_NEW(opal_buffer_t); command = ORCM_ANALYTICS_WORKFLOW_CREATE; /* pack the alloc command flag */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, OPAL_UINT8))) { goto ERROR; } /* pack the length of the array */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &i, 1, OPAL_INT))) { goto ERROR; } if (oflow_array) { /* pack the array */ if (OPAL_SUCCESS != (rc = 
opal_dss.pack(buf, oflow_array, i, OPAL_VALUE))) { goto ERROR; } } /* send it to the aggregator */ if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(&wf_agg, buf, ORCM_RML_TAG_ANALYTICS, orte_rml_send_callback, NULL))) { goto ERROR; } /* unpack workflow id */ ORTE_WAIT_FOR_COMPLETION(xfer.active); n=1; if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &wfid, &n, OPAL_INT))) { goto ERROR; } printf("Workflow created with id: %i\n", wfid); OBJ_DESTRUCT(&xfer); if (ORTE_SUCCESS != orcm_finalize()) { fprintf(stderr, "Failed orcm_finalize\n"); exit(1); } return ORTE_SUCCESS; ERROR: if (NULL != oflow_array) { for (n = 0; n < i; n++) { OBJ_RELEASE(oflow_array[n]); } free(oflow_array); } ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; }
int orcm_ocli_session_cancel(char **argv) { orcm_scd_cmd_flag_t command; orcm_alloc_id_t id; opal_buffer_t *buf; orte_rml_recv_cb_t xfer; long session; int rc, n, result; if (3 != opal_argv_count(argv)) { fprintf(stderr, "incorrect arguments to \"session cancel\"\n"); return ORCM_ERR_BAD_PARAM; } session = strtol(argv[2], NULL, 10); // FIXME: validate session id better if (session > 0) { /* setup to receive the result */ OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t); xfer.active = true; orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORCM_RML_TAG_SCD, ORTE_RML_NON_PERSISTENT, orte_rml_recv_callback, &xfer); /* send it to the scheduler */ buf = OBJ_NEW(opal_buffer_t); command = ORCM_SESSION_CANCEL_COMMAND; /* pack the cancel command flag */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &command, 1, ORCM_SCD_CMD_T))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; } id = (orcm_alloc_id_t)session; /* pack the session id */ if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &id, 1, ORCM_ALLOC_ID_T))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; } if (ORTE_SUCCESS != (rc = orte_rml.send_buffer_nb(ORTE_PROC_MY_SCHEDULER, buf, ORCM_RML_TAG_SCD, orte_rml_send_callback, NULL))) { ORTE_ERROR_LOG(rc); OBJ_RELEASE(buf); OBJ_DESTRUCT(&xfer); return rc; } /* get result */ n=1; ORTE_WAIT_FOR_COMPLETION(xfer.active); if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &result, &n, OPAL_INT))) { ORTE_ERROR_LOG(rc); OBJ_DESTRUCT(&xfer); return rc; } if (0 == result) { printf("Success\n"); } else { printf("Failure\n"); } } else { fprintf(stderr, "Invalid SESSION ID\n"); return ORCM_ERROR; } return ORCM_SUCCESS; }