Esempio n. 1
0
static int ack_open (void *qos_channel, opal_buffer_t * buf)  {
    int32_t rc = ORTE_SUCCESS;
    uint32_t eviction_timeout;
    orte_qos_ack_channel_t *ack_chan;
    ack_chan = (orte_qos_ack_channel_t*) (qos_channel);
    /* TO DO - need to adjust eviction timeout according to window size
       lets keep max time out for the first pass */
    eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000;
    /* init outstanding msg hotel */
    opal_hotel_init (&ack_chan->outstanding_msgs, QOS_ACK_MAX_OUTSTANDING_MSGS,
                       orte_event_base, eviction_timeout, 0,
                       orte_qos_ack_msg_ack_timeout_callback);
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_open channel = %p init hotel timeout =%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)ack_chan, eviction_timeout));
    /* set the message window timer event, but don't activate it */
    /*opal_event_set(opal_event_base,
                   &ack_chan->msg_window_timer_event,
                   -1, 0, orte_qos_ack_msg_window_timeout_callback,
                   ack_chan);
    opal_event_set_priority(&ack_chan->msg_window_timer_event, ORTE_MSG_PRI);*/
    /* the Qos module puts the non local attributes  to be sent to the peer in a list at the time of create.
      pack those attributes into the buffer.*/
    if (ORTE_SUCCESS != (rc =  orte_qos_base_pack_attributes(buf, &ack_chan->attributes)))
        ORTE_ERROR_LOG(rc);
    return rc;
}
Esempio n. 2
0
static int ack_init_recv (void *channel, opal_list_t *attributes) {
    int32_t rc = ORTE_SUCCESS;
    uint32_t eviction_timeout;
    orte_qos_ack_channel_t *ack_chan;
    ack_chan = (orte_qos_ack_channel_t*) channel;
    /* TO DO - need to adjust eviction timeout according to window size
       lets keep max time out for the first pass */
    eviction_timeout = (ack_chan->timeout_secs + QOS_ACK_WINDOW_TIMEOUT_IN_SECS) * 100000;
    /* init outstanding msg hotel */
    opal_hotel_init (&ack_chan->outstanding_msgs, QOS_ACK_MAX_OUTSTANDING_MSGS,
                     orte_event_base, eviction_timeout, 0,
                     orte_qos_ack_recv_msg_timeout_callback);
    OPAL_OUTPUT_VERBOSE((1, orte_qos_base_framework.framework_output,
                         "%s ack_open channel = %p init hotel timeout =%d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (void*)ack_chan, eviction_timeout));
    opal_event_evtimer_set (orte_event_base, &ack_chan->msg_ack_timer_event,
                            orte_qos_ack_msg_window_timeout_callback, (void *) ack_chan);
    return rc;
}
/*
 * Construct/destruct an endpoint structure.
 */
static void endpoint_construct(mca_btl_base_endpoint_t* endpoint)
{
    int i;

    endpoint->endpoint_module = NULL;
    endpoint->endpoint_proc = NULL;
    endpoint->endpoint_proc_index = -1;
    endpoint->endpoint_exiting = false;

    for (i=0; i<USNIC_NUM_CHANNELS; ++i) {
        endpoint->endpoint_remote_addr.qp_num[i] = 0;
    }
    endpoint->endpoint_remote_addr.gid.global.subnet_prefix = 0;
    endpoint->endpoint_remote_addr.gid.global.interface_id = 0;
    endpoint->endpoint_remote_ah = NULL;

    endpoint->endpoint_send_credits = 8;

    /* list of fragments queued to be sent */
    OBJ_CONSTRUCT(&endpoint->endpoint_frag_send_queue, opal_list_t);

    endpoint->endpoint_next_frag_id = 1;
    endpoint->endpoint_acktime = 0;

    endpoint->endpoint_rfstart = endpoint->endpoint_next_contig_seq_to_recv;

    /* endpoint starts not-ready-to-send */
    endpoint->endpoint_ready_to_send = 0;
    endpoint->endpoint_ack_needed = false;

    /* clear sent/received sequence number array */
    memset(endpoint->endpoint_sent_segs, 0,
            sizeof(endpoint->endpoint_sent_segs));
    memset(endpoint->endpoint_rcvd_segs, 0,
            sizeof(endpoint->endpoint_rcvd_segs));

    /*
     * Make a new OPAL hotel for this module
     * "hotel" is a construct used for triggering segment retransmission
     * due to timeout
     */
    OBJ_CONSTRUCT(&endpoint->endpoint_hotel, opal_hotel_t);
    opal_hotel_init(&endpoint->endpoint_hotel, 
                    WINDOW_SIZE,
                    mca_btl_usnic_component.retrans_timeout,
                    0,
                    ompi_btl_usnic_ack_timeout);

    /* Setup this endpoint's list links */
    OBJ_CONSTRUCT(&(endpoint->endpoint_ack_li), opal_list_item_t);
    OBJ_CONSTRUCT(&(endpoint->endpoint_endpoint_li), opal_list_item_t);
    endpoint->endpoint_ack_needed = false;

    /* fragment reassembly info */
    endpoint->endpoint_rx_frag_info =
        calloc(sizeof(struct ompi_btl_usnic_rx_frag_info_t), MAX_ACTIVE_FRAGS);
    assert(NULL != endpoint->endpoint_rx_frag_info);
    if (OPAL_UNLIKELY(endpoint->endpoint_rx_frag_info == NULL)) {
        BTL_ERROR(("calloc returned NULL -- this should not happen!"));
        ompi_btl_usnic_exit();
        /* Does not return */
    }
}
Esempio n. 4
0
/*
 * Initialize global variables used w/in the server.
 */
int pmix_server_init(void)
{
    int rc;
    opal_list_t info;
    opal_value_t *kv;

    if (orte_pmix_server_globals.initialized) {
        return ORTE_SUCCESS;
    }
    orte_pmix_server_globals.initialized = true;

    /* setup the server's state variables */
    OBJ_CONSTRUCT(&orte_pmix_server_globals.reqs, opal_hotel_t);
    if (OPAL_SUCCESS != (rc = opal_hotel_init(&orte_pmix_server_globals.reqs,
                                              orte_pmix_server_globals.num_rooms,
                                              orte_event_base, orte_pmix_server_globals.timeout*1000000,
                                              ORTE_ERROR_PRI, eviction_cbfunc))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    OBJ_CONSTRUCT(&orte_pmix_server_globals.notifications, opal_list_t);

   /* setup recv for direct modex requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX,
                            ORTE_RML_PERSISTENT, pmix_server_dmdx_recv, NULL);

    /* setup recv for replies to direct modex requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DIRECT_MODEX_RESP,
                            ORTE_RML_PERSISTENT, pmix_server_dmdx_resp, NULL);

    /* setup recv for replies to proxy launch requests */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_LAUNCH_RESP,
                            ORTE_RML_PERSISTENT, pmix_server_launch_resp, NULL);

    /* setup recv for replies from data server */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT,
                            ORTE_RML_PERSISTENT, pmix_server_keyval_client, NULL);

    /* setup recv for notifications */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_NOTIFICATION,
                            ORTE_RML_PERSISTENT, pmix_server_notify, NULL);

    /* ensure the PMIx server uses the proper rendezvous directory */
    opal_setenv("PMIX_SERVER_TMPDIR", orte_process_info.proc_session_dir, true, &environ);

    /* pass the server the local topology - we do this so the procs won't read the
     * topology themselves as this could overwhelm the local
     * system on large-scale SMPs */
    OBJ_CONSTRUCT(&info, opal_list_t);
    if (NULL != opal_hwloc_topology) {
        char *xmlbuffer=NULL;
        int len;
        kv = OBJ_NEW(opal_value_t);
        kv->key = strdup(OPAL_PMIX_LOCAL_TOPO);
        if (0 != hwloc_topology_export_xmlbuffer(opal_hwloc_topology, &xmlbuffer, &len)) {
            OBJ_RELEASE(kv);
            OBJ_DESTRUCT(&info);
            return ORTE_ERROR;
        }
        kv->data.string = xmlbuffer;
        kv->type = OPAL_STRING;
        opal_list_append(&info, &kv->super);
    }
    /* tell the server to allow tool connections */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_TOOL_SUPPORT);
    kv->type = OPAL_BOOL;
    kv->data.flag = true;
    opal_list_append(&info, &kv->super);
    /* tell the server our temp directory */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SERVER_TMPDIR);
    kv->type = OPAL_STRING;
    kv->data.string = opal_os_path(false, orte_process_info.jobfam_session_dir, NULL);
    opal_list_append(&info, &kv->super);
    /* use the same for the system temp directory - this is
     * where the system-level tool connections will go */
    kv = OBJ_NEW(opal_value_t);
    kv->key = strdup(OPAL_PMIX_SYSTEM_TMPDIR);
    kv->type = OPAL_STRING;
    kv->data.string = strdup(orte_process_info.tmpdir_base);
    opal_list_append(&info, &kv->super);

    /* setup the local server */
    if (ORTE_SUCCESS != (rc = opal_pmix.server_init(&pmix_server, &info))) {
        /* pmix will provide a nice show_help output here */
        return rc;
    }
    OPAL_LIST_DESTRUCT(&info);

    /* if the universal server wasn't specified, then we use
     * our own HNP for that purpose */
    if (NULL == orte_pmix_server_globals.server_uri) {
        orte_pmix_server_globals.server = *ORTE_PROC_MY_HNP;
    } else {
        char *server;
        opal_buffer_t buf;
        if (0 == strncmp(orte_pmix_server_globals.server_uri, "file", strlen("file")) ||
            0 == strncmp(orte_pmix_server_globals.server_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;

            /* it is a file - get the filename */
            filename = strchr(orte_pmix_server_globals.server_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-bad", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }
            ++filename; /* space past the : */

            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-missing", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }

            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-orterun.txt", "orterun:ompi-server-filename-access", true,
                               orte_basename, orte_pmix_server_globals.server_uri);
                return ORTE_ERR_BAD_PARAM;
            }
            if (NULL == fgets(input, 1024, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-orterun.txt", "orterun:ompi-server-file-bad", true,
                               orte_basename, orte_pmix_server_globals.server_uri,
                               orte_basename);
                return ORTE_ERR_BAD_PARAM;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            server = strdup(input);
        } else {
            server = strdup(orte_pmix_server_globals.server_uri);
        }
        /* setup our route to the server */
        OBJ_CONSTRUCT(&buf, opal_buffer_t);
        opal_dss.pack(&buf, &server, 1, OPAL_STRING);
        if (ORTE_SUCCESS != (rc = orte_rml_base_update_contact_info(&buf))) {
            ORTE_ERROR_LOG(rc);
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return rc;
        }
        OBJ_DESTRUCT(&buf);
        /* parse the URI to get the server's name */
        if (ORTE_SUCCESS != (rc = orte_rml_base_parse_uris(server, &orte_pmix_server_globals.server, NULL))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
        /* check if we are to wait for the server to start - resolves
         * a race condition that can occur when the server is run
         * as a background job - e.g., in scripts
         */
        if (orte_pmix_server_globals.wait_for_server) {
            /* ping the server */
            struct timeval timeout;
            timeout.tv_sec = orte_pmix_server_globals.timeout;
            timeout.tv_usec = 0;
            if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
                /* try it one more time */
                if (ORTE_SUCCESS != (rc = orte_rml.ping(server, &timeout))) {
                    /* okay give up */
                    orte_show_help("help-orterun.txt", "orterun:server-not-found", true,
                                   orte_basename, server,
                                   (long)orte_pmix_server_globals.timeout,
                                   ORTE_ERROR_NAME(rc));
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    return rc;
                }
            }
        }
    }

    return rc;
}