Ejemplo n.º 1
0
EXPORT int _GetString( char *s )
{
	return tm_getline((UB*)s);
}
Ejemplo n.º 2
0
static int discover(opal_list_t* nodelist, char *pbs_jobid)
{
    int32_t nodeid;
    orte_node_t *node;
    opal_list_item_t* item;
    FILE *fp;
    char *hostname;
    struct timeval start, stop;

    /* check for timing request - get start time if so */
    if (orte_timing) {
        gettimeofday(&start, NULL);
    }
    
    /* Ignore anything that the user already specified -- we're
       getting nodes only from TM. */

    /* TM "nodes" may actually correspond to PBS "VCPUs", which means
       there may be multiple "TM nodes" that correspond to the same
       physical node.  This doesn't really affect what we're doing
       here (we actually ignore the fact that they're duplicates --
       slightly inefficient, but no big deal); just mentioned for
       completeness... */

    /* setup the full path to the PBS file */
    filename = opal_os_path(false, mca_ras_tm_component.nodefile_dir,
                            pbs_jobid, NULL);
    fp = fopen(filename, "r");
    if (NULL == fp) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        free(filename);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* Iterate through all the nodes and make an entry for each.  TM
       node ID's will never be duplicated, but they may end up
       resolving to the same hostname (i.e., vcpu's on a single
       host). */

    nodeid=0;
    while (NULL != (hostname = tm_getline(fp))) {

        OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                             "%s ras:tm:allocate:discover: got hostname %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));

        /* Remember that TM may list the same node more than once.  So
           we have to check for duplicates. */

        for (item = opal_list_get_first(nodelist);
             opal_list_get_end(nodelist) != item;
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, hostname)) {
                ++node->slots;

                OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                                     "%s ras:tm:allocate:discover: found -- bumped slots to %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));
                
                break;
            }
        }

        /* Did we find it? */

        if (opal_list_get_end(nodelist) == item) {

            /* Nope -- didn't find it, so add a new item to the list */
            
            OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                                 "%s ras:tm:allocate:discover: not found -- added to list",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
            
            node = OBJ_NEW(orte_node_t);
            node->name = hostname;
            node->launch_id = nodeid;
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = 1;
            opal_list_append(nodelist, &node->super);
        } else {

            /* Yes, so we need to free the hostname that came back */
            free(hostname);
        }

        /* up the nodeid */
        nodeid++;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (orte_timing) {
        gettimeofday(&stop, NULL);
        opal_output(0, "ras_tm: time to allocate is %ld usec",
                    (long int)((stop.tv_sec - start.tv_sec)*1000000 +
                               (stop.tv_usec - start.tv_usec)));
        gettimeofday(&start, NULL);
    }
    
    return ORTE_SUCCESS;
}
Ejemplo n.º 3
0
static int discover(opal_list_t* nodelist, char *pbs_jobid)
{
    int32_t nodeid;
    orte_node_t *node;
    opal_list_item_t* item;
    FILE *fp;
    char *hostname, *cppn;
    int ppn;
    char *ptr;

    /* Ignore anything that the user already specified -- we're
       getting nodes only from TM. */

    /* TM "nodes" may actually correspond to PBS "VCPUs", which means
       there may be multiple "TM nodes" that correspond to the same
       physical node.  This doesn't really affect what we're doing
       here (we actually ignore the fact that they're duplicates --
       slightly inefficient, but no big deal); just mentioned for
       completeness... */

    /* if we are in SMP mode, then read the environment to get the
     * number of cpus for each node read in the file
     */
    if (mca_ras_tm_component.smp_mode) {
        if (NULL == (cppn = getenv("PBS_PPN"))) {
            orte_show_help("help-ras-tm.txt", "smp-error", true);
            return ORTE_ERR_NOT_FOUND;
        }
        ppn = strtol(cppn, NULL, 10);
    } else {
        ppn = 1;
    }

    /* setup the full path to the PBS file */
    filename = opal_os_path(false, mca_ras_tm_component.nodefile_dir,
                            pbs_jobid, NULL);
    fp = fopen(filename, "r");
    if (NULL == fp) {
        ORTE_ERROR_LOG(ORTE_ERR_FILE_OPEN_FAILURE);
        free(filename);
        return ORTE_ERR_FILE_OPEN_FAILURE;
    }

    /* Iterate through all the nodes and make an entry for each.  TM
       node ID's will never be duplicated, but they may end up
       resolving to the same hostname (i.e., vcpu's on a single
       host). */

    nodeid=0;
    while (NULL != (hostname = tm_getline(fp))) {
        if( !orte_keep_fqdn_hostnames && !opal_net_isaddr(hostname) ) {
            if (NULL != (ptr = strchr(hostname, '.'))) {
                *ptr = '\0';
            }
        }

        OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                             "%s ras:tm:allocate:discover: got hostname %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));

        /* Remember that TM may list the same node more than once.  So
           we have to check for duplicates. */

        for (item = opal_list_get_first(nodelist);
             opal_list_get_end(nodelist) != item;
             item = opal_list_get_next(item)) {
            node = (orte_node_t*) item;
            if (0 == strcmp(node->name, hostname)) {
                if (mca_ras_tm_component.smp_mode) {
                    /* this cannot happen in smp mode */
                    orte_show_help("help-ras-tm.txt", "smp-multi", true);
                    return ORTE_ERR_BAD_PARAM;
                }
                ++node->slots;

                OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                     "%s ras:tm:allocate:discover: found -- bumped slots to %d",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), node->slots));

                break;
            }
        }

        /* Did we find it? */

        if (opal_list_get_end(nodelist) == item) {

            /* Nope -- didn't find it, so add a new item to the list */

            OPAL_OUTPUT_VERBOSE((1, orte_ras_base_framework.framework_output,
                                 "%s ras:tm:allocate:discover: not found -- added to list",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

            node = OBJ_NEW(orte_node_t);
            node->name = hostname;
            orte_set_attribute(&node->attributes, ORTE_NODE_LAUNCH_ID, ORTE_ATTR_LOCAL, &nodeid, OPAL_INT32);
            node->slots_inuse = 0;
            node->slots_max = 0;
            node->slots = ppn;
            node->state = ORTE_NODE_STATE_UP;
            opal_list_append(nodelist, &node->super);
        } else {

            /* Yes, so we need to free the hostname that came back */
            free(hostname);
        }

        /* up the nodeid */
        nodeid++;
    }
    fclose(fp);

    return ORTE_SUCCESS;
}