/*
 * Open an ACK QoS channel.
 *
 * Initializes the "hotel" that tracks the channel's outstanding messages
 * and packs the channel's attribute list into the supplied buffer.
 *
 * qos_channel  the channel; treated as an orte_qos_ack_channel_t*
 * buf          buffer that receives the packed attributes
 *
 * Returns ORTE_SUCCESS, or the error code reported by opal_hotel_init()
 * or orte_qos_base_pack_attributes().
 */
static int ack_open (void *qos_channel, opal_buffer_t * buf)
{
    int32_t rc;
    uint32_t eviction_timeout;
    orte_qos_ack_channel_t *ack_chan = (orte_qos_ack_channel_t*) qos_channel;

    /* TO DO - need to adjust eviction timeout according to window size
       lets keep max time out for the first pass */
    /* NOTE(review): both operands are expressed in seconds and are scaled by
     * 100000 here; if the hotel expects microseconds this should presumably
     * be 1000000 -- confirm against opal_hotel_init() before changing. */
    eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000;

    /* init outstanding msg hotel; a failed init leaves the hotel unusable,
     * so report it to the caller instead of carrying on */
    rc = opal_hotel_init (&ack_chan->outstanding_msgs,
                          QOS_ACK_MAX_OUTSTANDING_MSGS,
                          orte_event_base,
                          eviction_timeout, 0,
                          orte_qos_ack_msg_ack_timeout_callback);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_open channel = %p init hotel timeout =%u",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)ack_chan, (unsigned int)eviction_timeout));

    /* set the message window timer event, but don't activate it */
    /*opal_event_set(opal_event_base, &ack_chan->msg_window_timer_event, -1, 0,
                     orte_qos_ack_msg_window_timeout_callback, ack_chan);
      opal_event_set_priority(&ack_chan->msg_window_timer_event, ORTE_MSG_PRI);*/

    /* the Qos module puts the non local attributes to be sent to the peer in
     * a list at the time of create. pack those attributes into the buffer. */
    rc = orte_qos_base_pack_attributes(buf, &ack_chan->attributes);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
    }
    return rc;
}
/*
 * Initialize the receive side of an ACK QoS channel.
 *
 * Initializes the hotel that tracks the channel's outstanding messages and
 * sets (but does not add/activate) the channel's ack timer event.
 *
 * channel     the channel; treated as an orte_qos_ack_channel_t*
 * attributes  not used by this function
 *
 * Returns ORTE_SUCCESS, or the error code reported by opal_hotel_init().
 */
static int ack_init_recv (void *channel, opal_list_t *attributes)
{
    int32_t rc;
    uint32_t eviction_timeout;
    orte_qos_ack_channel_t *ack_chan = (orte_qos_ack_channel_t*) channel;

    (void) attributes;

    /* TO DO - need to adjust eviction timeout according to window size
       lets keep max time out for the first pass */
    /* NOTE(review): both operands are expressed in seconds and are scaled by
     * 100000 here; if the hotel expects microseconds this should presumably
     * be 1000000 -- confirm against opal_hotel_init() before changing. */
    eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000;

    /* init outstanding msg hotel; bail out if that fails so the timer is
     * never armed against an unusable hotel */
    rc = opal_hotel_init (&ack_chan->outstanding_msgs,
                          QOS_ACK_MAX_OUTSTANDING_MSGS,
                          orte_event_base,
                          eviction_timeout, 0,
                          orte_qos_ack_recv_msg_timeout_callback);
    if (ORTE_SUCCESS != rc) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }

    /* the trace names this function (it previously said "ack_open") */
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_init_recv channel = %p init hotel timeout =%u",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)ack_chan, (unsigned int)eviction_timeout));

    opal_event_evtimer_set (orte_event_base,
                            &ack_chan->msg_ack_timer_event,
                            orte_qos_ack_msg_window_timeout_callback,
                            (void *) ack_chan);
    return rc;
}
/*
 * Construct/destruct an endpoint structure.
 */

/*
 * Constructor: put every field of the endpoint into its initial state,
 * set up the retransmission hotel and list links, and allocate the
 * fragment reassembly table.  If that allocation fails the process is
 * terminated via ompi_btl_usnic_exit().
 */
static void endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    int i;

    endpoint->endpoint_module = NULL;
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_proc_index = -1;
    endpoint->endpoint_exiting = false;

    /* no remote addressing information is known yet */
    for (i = 0; i < USNIC_NUM_CHANNELS; ++i) {
        endpoint->endpoint_remote_addr.qp_num[i] = 0;
    }
    endpoint->endpoint_remote_addr.gid.global.subnet_prefix = 0;
    endpoint->endpoint_remote_addr.gid.global.interface_id = 0;
    endpoint->endpoint_remote_ah = NULL;

    endpoint->endpoint_send_credits = 8;

    /* list of fragments queued to be sent */
    OBJ_CONSTRUCT(&endpoint->endpoint_frag_send_queue, opal_list_t);

    endpoint->endpoint_next_frag_id = 1;
    endpoint->endpoint_acktime = 0;

    /* NOTE(review): endpoint_next_contig_seq_to_recv is not assigned anywhere
     * in this constructor before it is read here -- confirm that it is
     * initialized elsewhere (or that endpoint_rfstart is reset later). */
    endpoint->endpoint_rfstart = endpoint->endpoint_next_contig_seq_to_recv;

    /* endpoint starts not-ready-to-send */
    endpoint->endpoint_ready_to_send = 0;
    endpoint->endpoint_ack_needed = false;

    /* clear sent/received sequence number array */
    memset(endpoint->endpoint_sent_segs, 0,
           sizeof(endpoint->endpoint_sent_segs));
    memset(endpoint->endpoint_rcvd_segs, 0,
           sizeof(endpoint->endpoint_rcvd_segs));

    /*
     * Make a new OPAL hotel for this module
     * "hotel" is a construct used for triggering segment retransmission
     * due to timeout
     */
    OBJ_CONSTRUCT(&endpoint->endpoint_hotel, opal_hotel_t);
    opal_hotel_init(&endpoint->endpoint_hotel,
                    WINDOW_SIZE,
                    mca_btl_usnic_component.retrans_timeout,
                    0,
                    ompi_btl_usnic_ack_timeout);

    /* Setup this endpoint's list links */
    OBJ_CONSTRUCT(&(endpoint->endpoint_ack_li), opal_list_item_t);
    OBJ_CONSTRUCT(&(endpoint->endpoint_endpoint_li), opal_list_item_t);

    /* fragment reassembly info: MAX_ACTIVE_FRAGS zeroed entries
     * (calloc takes the element count first, then the element size) */
    endpoint->endpoint_rx_frag_info =
        calloc(MAX_ACTIVE_FRAGS, sizeof(struct ompi_btl_usnic_rx_frag_info_t));
    /* the assert catches the failure in debug builds; the explicit check
     * below still handles it when asserts are compiled out */
    assert(NULL != endpoint->endpoint_rx_frag_info);
    if (OPAL_UNLIKELY(endpoint->endpoint_rx_frag_info == NULL)) {
        BTL_ERROR(("calloc returned NULL -- this should not happen!"));
        ompi_btl_usnic_exit();
        /* Does not return */
    }
}
/* * Initialize global variables used w/in the server. */ int pmix_server_init(void) { int rc; opal_list_t info; opal_value_t *kv; if (orte_pmix_server_globals.initialized) { return ORTE_SUCCESS; } orte_pmix_server_globals.initialized = true; /* setup the server's state variables */ OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t); if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs, orte_pmix_server_globals.num_rooms, orte_event_base, orte_pmix_server_globals.timeout*1000000, ORTE_ERROR_PRI, eviction_cbfunc))) { ORTE_ERROR_LOG(rc); return rc; } OBJ_CONSTRUCT(&orte_pmix_server_globals.notifications, opal_list_t); /* setup recv for direct modex requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX, ORTE_RML_PERSISTENT, pmix_server_dmdx_recv, NULL); /* setup recv for replies to direct modex requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX_RESP, ORTE_RML_PERSISTENT, pmix_server_dmdx_resp, NULL); /* setup recv for replies to proxy launch requests */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP, ORTE_RML_PERSISTENT, pmix_server_launch_resp, NULL); /* setup recv for replies from data server */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT, ORTE_RML_PERSISTENT, pmix_server_keyval_client, NULL); /* setup recv for notifications */ orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFICATION, ORTE_RML_PERSISTENT, pmix_server_notify, NULL); /* ensure the PMIx server uses the proper rendezvous directory */ opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ); /* pass the server the local topology - we do this so the procs won't read the * topology themselves as this could overwhelm the local * system on large-scale SMPs */ OBJ_CONSTRUCT(&info, opal_list_t); if (NULL != opal_hwloc_topology) { char *xmlbuffer=NULL; int len; kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_LOCAL_TOPO); if (0 != 
hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) { OBJ_RELEASE(kv); OBJ_DESTRUCT(&info); return ORTE_ERROR; } kv->data.string = xmlbuffer; kv->type = OPAL_STRING; opal_list_append(&info, &kv->super); } /* tell the server to allow tool connections */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SERVER_TOOL_SUPPORT); kv->type = OPAL_BOOL; kv->data.flag = true; opal_list_append(&info, &kv->super); /* tell the server our temp directory */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR); kv->type = OPAL_STRING; kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL); opal_list_append(&info, &kv->super); /* use the same for the system temp directory - this is * where the system-level tool connections will go */ kv = OBJ_NEW(opal_value_t); kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR); kv->type = OPAL_STRING; kv->data.string = strdup(orte_process_info.tmpdir_base); opal_list_append(&info, &kv->super); /* setup the local server */ if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) { /* pmix will provide a nice show_help output here */ return rc; } OPAL_LIST_DESTRUCT(&info); /* if the universal server wasn't specified, then we use * our own HNP for that purpose */ if (NULL == orte_pmix_server_globals.server_uri) { orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP; } else { char *server; opal_buffer_t buf; if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) || 0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) { char input[1024], *filename; FILE *fp; /* it is a file - get the filename */ filename = strchr(orte_pmix_server_globals.server_uri, ':'); if (NULL == filename) { /* filename is not correctly formatted */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } ++filename; /* space past the : */ if (0 >= strlen(filename)) 
{ /* they forgot to give us the name! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } /* open the file and extract the uri */ fp = fopen(filename, "r"); if (NULL == fp) { /* can't find or read file! */ orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true, orte_basename, orte_pmix_server_globals.server_uri); return ORTE_ERR_BAD_PARAM; } if (NULL == fgets(input, 1024, fp)) { /* something malformed about file */ fclose(fp); orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true, orte_basename, orte_pmix_server_globals.server_uri, orte_basename); return ORTE_ERR_BAD_PARAM; } fclose(fp); input[strlen(input)-1] = '\0'; /* remove newline */ server = strdup(input); } else { server = strdup(orte_pmix_server_globals.server_uri); } /* setup our route to the server */ OBJ_CONSTRUCT(&buf, opal_buffer_t); opal_dss.pack(&buf, &server, 1, OPAL_STRING); if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) { ORTE_ERROR_LOG(rc); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return rc; } OBJ_DESTRUCT(&buf); /* parse the URI to get the server's name */ if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) { ORTE_ERROR_LOG(rc); return rc; } /* check if we are to wait for the server to start - resolves * a race condition that can occur when the server is run * as a background job - e.g., in scripts */ if (orte_pmix_server_globals.wait_for_server) { /* ping the server */ struct timeval timeout; timeout.tv_sec = orte_pmix_server_globals.timeout; timeout.tv_usec = 0; if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { /* try it one more time */ if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) { /* okay give up */ orte_show_help("help-orterun.txt", "orterun:server-not-found", true, orte_basename, server, (long)orte_pmix_server_globals.timeout, 
ORTE_ERROR_NAME(rc)); ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE); return rc; } } } } return rc; }