/*
 * Return the node rank of the given process.
 *
 * A query about this process itself is answered from the cached
 * my_node_rank value; any other process is looked up in the pidmap.
 * Returns ORTE_NODE_RANK_INVALID when no pidmap entry exists.
 */
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
    orte_pmap_t *entry;

    /* Asking about ourselves? Answer directly: the pidmap will not
     * have arrived yet while we are starting up, and with static
     * ports the node rank is already needed during init.
     */
    if (ORTE_PROC_MY_NAME->jobid == proc->jobid &&
        ORTE_PROC_MY_NAME->vpid == proc->vpid) {
        return my_node_rank;
    }

    /* everyone else must be resolved through the pidmap */
    entry = orte_util_lookup_pmap(proc);
    if (NULL == entry) {
        return ORTE_NODE_RANK_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:slurm: proc %s has node rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         (int)entry->node_rank));

    return entry->node_rank;
}
Example #2
0
/*
 * Return the node rank recorded in the pidmap for the given process,
 * or ORTE_NODE_RANK_INVALID when the process has no pidmap entry.
 */
static orte_node_rank_t proc_get_node_rank(orte_process_name_t *proc)
{
    orte_pmap_t *entry = orte_util_lookup_pmap(proc);

    /* unknown proc: nothing to report */
    if (NULL == entry) {
        return ORTE_NODE_RANK_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:generic: proc %s has node rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         (int)entry->node_rank));

    return entry->node_rank;
}
/*
 * Return the local rank recorded in the pidmap for the given process.
 *
 * When the process has no pidmap entry, ORTE_ERR_NOT_FOUND is logged
 * and ORTE_LOCAL_RANK_INVALID is returned.
 */
static orte_local_rank_t proc_get_local_rank(orte_process_name_t *proc)
{
    orte_pmap_t *entry = orte_util_lookup_pmap(proc);

    /* a missing entry is reported as an error here */
    if (NULL == entry) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        return ORTE_LOCAL_RANK_INVALID;
    }

    OPAL_OUTPUT_VERBOSE((2, orte_ess_base_output,
                         "%s ess:slurm: proc %s has local rank %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc),
                         (int)entry->local_rank));

    return entry->local_rank;
}
Example #4
0
/*
 * Find the nidmap entry for the node hosting the given process.
 *
 * Daemon names are resolved through find_daemon_node(); application
 * procs are resolved through their pidmap entry, whose "node" field
 * indexes orte_nidmap. Returns NULL when the application proc has no
 * pidmap entry; otherwise returns whatever the delegated lookup yields.
 */
orte_nid_t* orte_util_lookup_nid(orte_process_name_t *proc)
{
    orte_pmap_t *entry;

    OPAL_OUTPUT_VERBOSE((5, orte_debug_output,
                         "%s lookup:nid: looking for proc %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_NAME_PRINT(proc)));

    /* daemons are tracked separately from application procs */
    if (ORTE_JOBID_IS_DAEMON(proc->jobid)) {
        return find_daemon_node(proc);
    }

    /* application proc: go through its pidmap entry */
    entry = orte_util_lookup_pmap(proc);
    if (NULL == entry) {
        return NULL;
    }

    /* no explicit bounds check on the node index here: the lookup is
     * relied upon to range-check it (per the original author's note)
     */
    return (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, entry->node);
}
/***************  MODEX SECTION **************/
/*
 * Perform a full modex exchange with the given list of peers.
 *
 * This process packs its name, hostname, daemon vpid, node rank, local
 * rank and its modex entries into a buffer, exchanges that buffer with
 * the procs in "procs" via orte_grpcomm.allgather_list, and then walks
 * the returned buffer. For every contributing proc:
 *   - a nidmap entry is created for its host if none exists,
 *   - a jobmap and/or pidmap entry is created if none exists,
 *   - its modex data is either passed to
 *     orte_grpcomm_base_update_modex_entries() (modex_db == true) or
 *     unpacked into orte_attr_t objects appended to the node's
 *     attribute list (modex_db == false).
 *
 * @param procs     list of peers handed to allgather_list
 * @param modex_db  true to store entries in the modex database, false
 *                  to attach them to the nid's attrs list
 * @return ORTE_SUCCESS, or the first error code encountered
 */
int orte_grpcomm_base_full_modex(opal_list_t *procs, bool modex_db)
{
    opal_buffer_t buf, rbuf;
    int32_t i, n, num_procs;
    orte_std_cntr_t cnt, j, num_recvd_entries;
    orte_process_name_t proc_name;
    int rc=ORTE_SUCCESS;
    bool modex_reqd;
    orte_nid_t *nid;
    orte_local_rank_t local_rank;
    orte_node_rank_t node_rank;
    orte_jmap_t *jmap;
    orte_pmap_t *pmap;
    orte_vpid_t daemon;
    /* owned by this function once unpacked; kept NULL whenever no
     * unpacked string is outstanding so cleanup can free it blindly
     */
    char *hostname = NULL;
    
    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: performing modex",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* setup the buffer that will actually be sent */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    OBJ_CONSTRUCT(&rbuf, opal_buffer_t);
    
    /* put our process name in the buffer so it can be unpacked later */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* pack our hostname */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* pack our daemon's vpid */
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &ORTE_PROC_MY_DAEMON->vpid, 1, ORTE_VPID))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* pack our node rank */
    node_rank = orte_ess.get_node_rank(ORTE_PROC_MY_NAME);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &node_rank, 1, ORTE_NODE_RANK))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* pack our local rank */
    local_rank = orte_ess.get_local_rank(ORTE_PROC_MY_NAME);
    if (ORTE_SUCCESS != (rc = opal_dss.pack(&buf, &local_rank, 1, ORTE_LOCAL_RANK))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* pack the entries we have received */
    if (ORTE_SUCCESS != (rc = orte_grpcomm_base_pack_modex_entries(&buf, &modex_reqd))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: executing allgather",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* exchange the buffer with the list of peers */
    if (ORTE_SUCCESS != (rc = orte_grpcomm.allgather_list(procs, &buf, &rbuf))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((2, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: processing modex info",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    /* extract the number of procs that put data in the buffer */
    cnt=1;
    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_procs, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                         "%s grpcomm:base:full:modex: received %ld data bytes from %d procs",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (long)(rbuf.pack_ptr - rbuf.unpack_ptr), num_procs));
    
    /* if the buffer doesn't have any more data, ignore it */
    if (0 >= (rbuf.pack_ptr - rbuf.unpack_ptr)) {
        goto cleanup;
    }
    
    /* otherwise, process it */
    for (i=0; i < num_procs; i++) {
        /* unpack the process name */
        cnt=1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &proc_name, &cnt, ORTE_NAME))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack the hostname - the unpacked string is heap-allocated
         * and must be freed by us once we are done with it
         */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &hostname, &cnt, OPAL_STRING))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack the daemon vpid */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &daemon, &cnt, ORTE_VPID))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack the node rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &node_rank, &cnt, ORTE_NODE_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* unpack the local rank */
        cnt = 1;
        if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &local_rank, &cnt, ORTE_LOCAL_RANK))) {
            ORTE_ERROR_LOG(rc);
            goto cleanup;
        }
        
        /* UPDATE THE NIDMAP/PIDMAP TO SUPPORT DYNAMIC OPERATIONS */
        
        /* find this proc's node in the nidmap */
        nid = NULL;
        for (n=0; NULL != (nid = (orte_nid_t *) opal_pointer_array_get_item(&orte_nidmap, n)); n++) {
            if (0 == strcmp(hostname, nid->name)) {
                break;
            }
        }
        if (NULL == nid) {
            /* node wasn't found - let's add it */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex no nidmap entry for node %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), hostname));
            nid = OBJ_NEW(orte_nid_t);
            nid->name = strdup(hostname);
            nid->daemon = daemon;
            nid->index = opal_pointer_array_add(&orte_nidmap, nid);
        }
        
        /* the nid keeps its own copy of the name, so the unpacked
         * string is no longer needed - release it now so it cannot
         * leak on any of the later exits from this iteration
         */
        free(hostname);
        hostname = NULL;
        
        /* see if we have this job in a jobmap */
        if (NULL == (jmap = orte_util_lookup_jmap(proc_name.jobid))) {
            /* job wasn't found - let's add it */
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex no jobmap entry for job %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(proc_name.jobid)));
            jmap = OBJ_NEW(orte_jmap_t);
            jmap->job = proc_name.jobid;
            /* unfortunately, job objects cannot be stored
             * by index number as the jobid is a constructed
             * value. So we have to just add it to the end
             * of the array
             */
            opal_pointer_array_add(&orte_jobmap, jmap);
            jmap->num_procs = 1;
            /* have to add the pidmap entry too, but this
             * can be done at the specific site corresponding
             * to the proc's vpid
             */
            pmap = OBJ_NEW(orte_pmap_t);
            pmap->node = nid->index;
            pmap->local_rank = local_rank;
            pmap->node_rank = node_rank;
            opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
        } else {
            /* see if we have this proc in a pidmap */
            if (NULL == orte_util_lookup_pmap(&proc_name)) {
                /* proc wasn't found - let's add it */
                OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                     "%s grpcomm:base:full:modex no pidmap entry for proc %s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&proc_name)));
                pmap = OBJ_NEW(orte_pmap_t);
                pmap->node = nid->index;
                pmap->local_rank = local_rank;
                pmap->node_rank = node_rank;
                /* this can be done at the specific site corresponding
                 * to the proc's vpid
                 */
                opal_pointer_array_set_item(&jmap->pmap, proc_name.vpid, pmap);
                /* account for the proc entry in the jmap */
                jmap->num_procs++;
            }
        }
        
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                             "%s grpcomm:base:full:modex: adding modex entry for proc %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_NAME_PRINT(&proc_name)));
        
        /* UPDATE THE MODEX INFO FOR THIS PROC */
        
        if (modex_db) {
            /* update the modex database */
            if (ORTE_SUCCESS != (rc = orte_grpcomm_base_update_modex_entries(&proc_name, &rbuf))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }            
        } else {
            /* unpack the number of entries for this proc */
            cnt=1;
            if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_recvd_entries, &cnt, ORTE_STD_CNTR))) {
                ORTE_ERROR_LOG(rc);
                goto cleanup;
            }
            
            OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_output,
                                 "%s grpcomm:base:full:modex adding %d entries for proc %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), num_recvd_entries,
                                 ORTE_NAME_PRINT(&proc_name)));
            
            /*
             * Extract the attribute names and values
             */
            for (j = 0; j < num_recvd_entries; j++) {
                size_t num_bytes;
                orte_attr_t *attr;
                
                /* Until the attr is appended to the node's list it is
                 * owned here, so every failure below must release it.
                 * NOTE(review): this presumes the orte_attr_t destructor
                 * frees name/bytes - confirm against its class definition.
                 */
                attr = OBJ_NEW(orte_attr_t);
                cnt = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &(attr->name), &cnt, OPAL_STRING))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(attr);
                    goto cleanup;
                }
                
                cnt = 1;
                if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, &num_bytes, &cnt, OPAL_SIZE))) {
                    ORTE_ERROR_LOG(rc);
                    OBJ_RELEASE(attr);
                    goto cleanup;
                }
                attr->size = num_bytes;
                
                if (num_bytes != 0) {
                    if (NULL == (attr->bytes = (uint8_t *) malloc(num_bytes))) {
                        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        OBJ_RELEASE(attr);
                        goto cleanup;
                    }
                    cnt = (orte_std_cntr_t) num_bytes;
                    if (ORTE_SUCCESS != (rc = opal_dss.unpack(&rbuf, attr->bytes, &cnt, OPAL_BYTE))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(attr);
                        goto cleanup;
                    }
                }
                
                /* add this to the node's attribute list - the list now
                 * holds the reference created by OBJ_NEW
                 */
                opal_list_append(&nid->attrs, &attr->super);
            }
        }
    }
    
cleanup:
    /* non-NULL only if we bailed out between unpacking a hostname and
     * finishing the nid lookup; free(NULL) is a no-op otherwise
     */
    free(hostname);
    OBJ_DESTRUCT(&buf);
    OBJ_DESTRUCT(&rbuf);
    return rc;
}