Code example #1
File: osc_pt2pt_replyreq.c  Project: aosm/openmpi
static void ompi_osc_pt2pt_replyreq_construct(ompi_osc_pt2pt_replyreq_t *replyreq)
{
    OBJ_CONSTRUCT(&(replyreq->rep_target_convertor), ompi_convertor_t);
}
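
A constructor like this is normally paired with a matching destructor and registered via OBJ_CLASS_INSTANCE, so that OBJ_CONSTRUCT/OBJ_NEW and OBJ_DESTRUCT/OBJ_RELEASE run the pair automatically. A minimal sketch of the companion code (the parent class shown is an assumption, not taken from the original file):

/* Sketch only - the destructor mirrors the constructor, and the class
 * registration ties both to the type. */
static void ompi_osc_pt2pt_replyreq_destruct(ompi_osc_pt2pt_replyreq_t *replyreq)
{
    OBJ_DESTRUCT(&(replyreq->rep_target_convertor));
}

OBJ_CLASS_INSTANCE(ompi_osc_pt2pt_replyreq_t,
                   opal_list_item_t,   /* assumed parent class */
                   ompi_osc_pt2pt_replyreq_construct,
                   ompi_osc_pt2pt_replyreq_destruct);
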
Code example #2
int mca_btl_ud_component_open(void)
{
    int val;
    
    /* initialize state */
    mca_btl_ofud_component.num_btls = 0;
    mca_btl_ofud_component.ud_btls = NULL;
    
    /* initialize objects */
    OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_lock, opal_mutex_t);
    OBJ_CONSTRUCT(&mca_btl_ofud_component.ud_procs, opal_list_t);

    /* register IB component parameters */
    mca_btl_ud_param_reg_int("max_btls",
                             "Maximum number of HCAs/ports to use",
                             4, (int*)&mca_btl_ofud_component.max_btls);

    mca_btl_ud_param_reg_string("mpool", "Name of the memory pool to be used",
                                "rdma", &mca_btl_ofud_component.ud_mpool_name);

    mca_btl_ud_param_reg_int("ib_pkey_index", "IB pkey index",
                             0, (int*)&mca_btl_ofud_component.ib_pkey_ix);
    mca_btl_ud_param_reg_int("ib_qkey", "IB qkey",
                             0x01330133, (int*)&mca_btl_ofud_component.ib_qkey);
    mca_btl_ud_param_reg_int("ib_service_level", "IB service level",
                             0, (int*)&mca_btl_ofud_component.ib_service_level);
    mca_btl_ud_param_reg_int("ib_src_path_bits", "IB source path bits",
                             0, (int*)&mca_btl_ofud_component.ib_src_path_bits);

    mca_btl_ud_param_reg_int("sd_num", "maximum send descriptors to post",
                             128, (int*)&mca_btl_ofud_component.sd_num);

    mca_btl_ud_param_reg_int("rd_num", "number of receive buffers",
                             6000, (int*)&mca_btl_ofud_component.rd_num);
#if 0
    mca_btl_ud_param_reg_int("rd_num_init", "initial receive buffers",
                             3000, (int*)&mca_btl_ofud_component.rd_num_init);
    mca_btl_ud_param_reg_int("rd_num_max", "maximum receive buffers",
                             4500, (int*)&mca_btl_ofud_component.rd_num_max);
    mca_btl_ud_param_reg_int("rd_num_inc",
                             "number of buffers to post when rate is high",
                             25, (int*)&mca_btl_ofud_component.rd_num_inc);
#endif

    /* TODO - this assumes a 2k UD MTU - query/do something more intelligent */
    /*mca_btl_ud_param_reg_int("eager_limit", "eager send limit",
                             2048, &val); */
    mca_btl_ud_param_reg_int("min_send_size", "minimum send size",
                             2048, &val);
    mca_btl_ofud_module.super.btl_rndv_eager_limit = val;
    mca_btl_ud_param_reg_int("max_send_size", "maximum send size",
                             2048, &val);
    mca_btl_ofud_module.super.btl_eager_limit = val;
    mca_btl_ofud_module.super.btl_max_send_size = val;

    mca_btl_ud_param_reg_int("exclusivity", "BTL exclusivity",
                             MCA_BTL_EXCLUSIVITY_DEFAULT,
                             (int*)&mca_btl_ofud_module.super.btl_exclusivity);
    mca_btl_ud_param_reg_int("bandwidth",
                             "Approximate maximum bandwidth of interconnect",
                             800, (int*)&mca_btl_ofud_module.super.btl_bandwidth);
    
    mca_btl_ofud_module.super.btl_eager_limit -= sizeof(mca_btl_ud_header_t);
    mca_btl_ofud_module.super.btl_max_send_size -= sizeof(mca_btl_ud_header_t);

    return OMPI_SUCCESS;
}
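
The mca_btl_ud_param_reg_int/_string calls above are thin component-local wrappers around the generic MCA parameter API of that era. A hedged sketch of what the integer variant might look like (the component field used for the version struct is an assumption; the real helper lives elsewhere in the component):

/* Hypothetical wrapper - not from the original file. Registers the
 * parameter under the component's name and stores the resolved value. */
static inline void mca_btl_ud_param_reg_int(const char *param_name,
                                            const char *help_msg,
                                            int default_value, int *storage)
{
    mca_base_param_reg_int(&mca_btl_ofud_component.super.btl_version, /* assumed field */
                           param_name, help_msg,
                           false, false, default_value, storage);
}
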
Code example #3
static void xcast_recv(int status, orte_process_name_t* sender,
                       opal_buffer_t* buffer, orte_rml_tag_t tg,
                       void* cbdata)
{
    opal_list_item_t *item;
    orte_namelist_t *nm;
    int ret, cnt;
    opal_buffer_t *relay, *rly;
    orte_daemon_cmd_flag_t command = ORTE_DAEMON_NULL_CMD;
    opal_buffer_t wireup;
    opal_byte_object_t *bo;
    int8_t flag;
    orte_job_t *jdata;
    orte_proc_t *rec;
    opal_list_t coll;
    orte_grpcomm_signature_t *sig;
    orte_rml_tag_t tag;
    char *rtmod;

    OPAL_OUTPUT_VERBOSE((1, orte_grpcomm_base_framework.framework_output,
                         "%s grpcomm:direct:xcast:recv: with %d bytes",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         (int)buffer->bytes_used));

    /* we need a passthru buffer to send to our children */
    rly = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(rly, buffer);

    /* unpack the signature - we don't need it here, so release it */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }
    OBJ_RELEASE(sig);

    /* get the target tag */
    cnt=1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &tag, &cnt, ORTE_RML_TAG))) {
        ORTE_ERROR_LOG(ret);
        ORTE_FORCED_TERMINATE(ret);
        return;
    }

    /* setup a buffer we can pass to ourselves - this just contains
     * the initial message, minus the headers inserted by xcast itself */
    relay = OBJ_NEW(opal_buffer_t);
    opal_dss.copy_payload(relay, buffer);
    /* setup the relay list */
    OBJ_CONSTRUCT(&coll, opal_list_t);

    /* get our conduit's routed module name */
    rtmod = orte_rml.get_routed(orte_coll_conduit);

    /* if this is headed for the daemon command processor,
     * then we first need to check for add_local_procs
     * as that command includes some needed wireup info */
    if (ORTE_RML_TAG_DAEMON == tag) {
        /* peek at the command */
        cnt=1;
        if (ORTE_SUCCESS == (ret = opal_dss.unpack(buffer, &command, &cnt, ORTE_DAEMON_CMD))) {
            /* if it is add_procs, then... */
            if (ORTE_DAEMON_ADD_LOCAL_PROCS == command ||
                ORTE_DAEMON_DVM_NIDMAP_CMD == command) {
                /* extract the byte object holding the daemonmap */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                /* update our local nidmap, if required - the decode function
                 * knows what to do - it will also free the bytes in the byte object
                 */
                if (ORTE_PROC_IS_HNP) {
                    /* no need - already have the info */
                    if (NULL != bo) {
                        if (NULL != bo->bytes) {
                            free(bo->bytes);
                        }
                        free(bo);
                    }
                } else {
                    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                                         "%s grpcomm:direct:xcast updating daemon nidmap",
                                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

                    if (ORTE_SUCCESS != (ret = orte_util_decode_daemon_nodemap(bo))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                }

                /* update the routing plan */
                orte_routed.update_routing_plan(rtmod);

                /* see if we have wiring info as well */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT8))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }

                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    OBJ_RELEASE(relay);
                    relay = OBJ_NEW(opal_buffer_t);
                    /* repack the command */
                    if (OPAL_SUCCESS != (ret = opal_dss.pack(relay, &command, 1, ORTE_DAEMON_CMD))) {
                        ORTE_ERROR_LOG(ret);
                        goto relay;
                    }
                    if (0 == flag) {
                        /* copy the remainder of the payload */
                        opal_dss.copy_payload(relay, buffer);
                        /* no wireup info - just relay */
                        goto relay;
                    }
                }

                /* unpack the byte object */
                cnt=1;
                if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &bo, &cnt, OPAL_BYTE_OBJECT))) {
                    ORTE_ERROR_LOG(ret);
                    goto relay;
                }
                if (0 < bo->size) {
                    /* load it into a buffer */
                    OBJ_CONSTRUCT(&wireup, opal_buffer_t);
                    opal_dss.load(&wireup, bo->bytes, bo->size);
                    /* pass it for processing */
                    if (ORTE_SUCCESS != (ret = orte_rml_base_update_contact_info(&wireup))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_DESTRUCT(&wireup);
                        goto relay;
                    }
                    /* done with the wireup buffer - dump it */
                    OBJ_DESTRUCT(&wireup);
                }
                free(bo);
                if (ORTE_DAEMON_ADD_LOCAL_PROCS == command) {
                    /* copy the remainder of the payload */
                    opal_dss.copy_payload(relay, buffer);
                }
            }
        } else {
            ORTE_ERROR_LOG(ret);
            goto CLEANUP;
        }
    }

 relay:

    /* get the list of next recipients from the routed module */
    orte_routed.get_routing_list(rtmod, &coll);

    /* if list is empty, no relay is required */
    if (opal_list_is_empty(&coll)) {
        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay - recipient list is empty!",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        OBJ_RELEASE(rly);
        goto CLEANUP;
    }

    /* send the message to each recipient on list, deconstructing it as we go */
    while (NULL != (item = opal_list_remove_first(&coll))) {
        nm = (orte_namelist_t*)item;

        OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:direct:send_relay sending relay msg of %d bytes to %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (int)rly->bytes_used,
                             ORTE_NAME_PRINT(&nm->name)));
        OBJ_RETAIN(rly);
        /* check the state of the recipient - no point
         * sending to someone not alive
         */
        jdata = orte_get_job_data_object(nm->name.jobid);
        if (NULL == (rec = (orte_proc_t*)opal_pointer_array_get_item(jdata->procs, nm->name.vpid))) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not found - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_PROC_STATE_RUNNING < rec->state ||
            !ORTE_FLAG_TEST(rec, ORTE_PROC_FLAG_ALIVE)) {
            opal_output(0, "%s grpcomm:direct:send_relay proc %s not running - cannot relay",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), ORTE_NAME_PRINT(&nm->name));
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                           &nm->name, rly, ORTE_RML_TAG_XCAST,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(rly);
            OBJ_RELEASE(item);
            continue;
        }
        OBJ_RELEASE(item);
    }
    OBJ_RELEASE(rly);  // retain accounting

 CLEANUP:
    /* cleanup */
    OBJ_DESTRUCT(&coll);

    /* now send the relay buffer to myself for processing */
    if (ORTE_DAEMON_DVM_NIDMAP_CMD != command) {
        if (ORTE_SUCCESS != (ret = orte_rml.send_buffer_nb(orte_coll_conduit,
                                                           ORTE_PROC_MY_NAME, relay, tag,
                                                           orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(relay);
        }
    }
}
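
For reference, xcast_recv expects the buffer to carry a signature, then the target RML tag, then the payload. A minimal sketch (not the actual xcast send path) of a sender producing that layout:

/* Sketch only - illustrates the wire layout xcast_recv unpacks above. */
opal_buffer_t *buf = OBJ_NEW(opal_buffer_t);
opal_dss.pack(buf, &sig, 1, ORTE_SIGNATURE);   /* routing signature */
opal_dss.pack(buf, &tag, 1, ORTE_RML_TAG);     /* final delivery tag */
opal_dss.copy_payload(buf, msg);               /* original message payload */
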
Code example #4
File: pubsub_orte.c  Project: XuanWang1982/ompi
static char* lookup ( const char *service_name, ompi_info_t *info )
{
    orte_process_name_t *info_host;
    opal_buffer_t *buf;
    orte_data_server_cmd_t cmd=ORTE_DATA_SERVER_LOOKUP;
    orte_std_cntr_t cnt=0;
    char *port_name=NULL;
    int ret, rc, flag, i;
    char value[256], **tokens, *ptr;
    int lookup[2] = { GLOBAL, LOCAL };
    size_t num_tokens;
    orte_rml_recv_cb_t xfer;

    /* Look in the MPI_Info (ompi_info_t*) for the key
     * "ompi_lookup_order".  Acceptable values are:
     *
     * - "local" -- only check the local scope
     * - "global" -- only check the global scope
     * - "local,global" -- check the local scope first, then check the
     *   global scope
     * - "global,local" -- check the global scope first, then check the
     *   local scope
     *
     * Give a little leeway in terms of whitespace in the value.
     *
     * The lookup[2] array will contain the results: lookup[0] is the
     * first scope to check, lookup[1] is the 2nd.  Either value may
     * be NONE, LOCAL, or GLOBAL.  If both are NONE, clearly that's an
     * error.  :-)
     */
    ompi_info_get(info, "ompi_lookup_order", sizeof(value) - 1, value, &flag);
    if (flag) {
        ptr = &value[0];
        while (isspace(*ptr) && (ptr - value) < (int)sizeof(value)) {
            ++ptr;
        }
        if (ptr - value < (int)sizeof(value)) {
            tokens = opal_argv_split(ptr, ',');
            if (NULL != tokens) {
                if ((num_tokens = opal_argv_count(tokens)) > 2) {
                    /* too many values in the comma-delimited list */
                    opal_show_help("help-ompi-pubsub-orte.txt",
                                   "pubsub-orte:too-many-orders",
                                   true, (long)ORTE_PROC_MY_NAME->vpid,
                                   (long)num_tokens);
                    opal_argv_free(tokens);
                    return NULL;
                }
                for (i = 0; i < 2; ++i) {
                    if (NULL != tokens[i]) {
                        if (0 == strcasecmp(tokens[i], "local")) {
                            lookup[i] = LOCAL;
                        } else if (0 == strcasecmp(tokens[i], "global")) {
                            lookup[i] = GLOBAL;
                        } else {
                            /* unrecognized value -- that's an error */
                            opal_show_help("help-ompi-pubsub-orte.txt",
                                           "pubsub-orte:unknown-order",
                                           true, (long)ORTE_PROC_MY_NAME->vpid);
                            opal_argv_free(tokens);
                            return NULL;
                        }
                    } else {
                        lookup[i] = NONE;
                    }
                }
                opal_argv_free(tokens);
            }
        }

        if (NONE == lookup[0]) {
            /* if the user provided an info key, then we at least must
             * be given one place to look
             */
            opal_show_help("help-ompi-pubsub-orte.txt",
                           "pubsub-orte:unknown-order",
                           true, (long)ORTE_PROC_MY_NAME->vpid);
            return NULL;
        }

    } else {
        /* if no info key was provided, then we default to the global
         * server IF it is active
         */
        if (!server_setup) {
            setup_server();
        }
        lookup[1] = NONE;
        if (mca_pubsub_orte_component.server_found) {
            lookup[0] = GLOBAL;
        } else {
            /* global server was not found - just look local */
            lookup[0] = LOCAL;
        }
    }

    OPAL_OUTPUT_VERBOSE((1, ompi_pubsub_base_framework.framework_output,
                         "%s pubsub:orte: lookup service %s scope %d",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         service_name, lookup[0]));

    /* go find the value */
    for (i=0; i < 2; i++) {
        if (LOCAL == lookup[i]) {
            /* if the scope is local, then lookup the value on the HNP */
            info_host = ORTE_PROC_MY_HNP;
        } else if (GLOBAL == lookup[i]) {
            /* has the server been setup yet? */
            if (!server_setup) {
                setup_server();
            }
            /* lookup the value on the global ompi_server, but error
             * if that server wasn't contacted
             */
            if (!mca_pubsub_orte_component.server_found) {
                opal_show_help("help-ompi-pubsub-orte.txt",
                               "pubsub-orte:no-server",
                               true, (long)ORTE_PROC_MY_NAME->vpid,
                               "lookup from");
                return NULL;
            }
            info_host = &mca_pubsub_orte_component.server;
        } else if (NONE == lookup[i]) {
            continue;
        } else {
            /* unknown host! */
            opal_show_help("help-ompi-pubsub-orte.txt",
                           "pubsub-orte:unknown-order",
                           true, (long)ORTE_PROC_MY_NAME->vpid);
            return NULL;
        }

        /* go look it up */
        /* construct the buffer */
        buf = OBJ_NEW(opal_buffer_t);

        /* pack the lookup command */
        if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &cmd, 1, ORTE_DATA_SERVER_CMD))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buf);
            goto CLEANUP;
        }

        /* pack the service name */
        if (OPAL_SUCCESS != (ret = opal_dss.pack(buf, &service_name, 1, OPAL_STRING))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buf);
            goto CLEANUP;
        }

        /* send the cmd */
        if (0 > (ret = orte_rml.send_buffer_nb(info_host, buf,
                                               ORTE_RML_TAG_DATA_SERVER,
                                               orte_rml_send_callback, NULL))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buf);
            goto CLEANUP;
        }

        /* get the answer */
        OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
        xfer.active = true;
        orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD,
                                ORTE_RML_TAG_DATA_CLIENT,
                                ORTE_RML_NON_PERSISTENT,
                                orte_rml_recv_callback, &xfer);
        OMPI_WAIT_FOR_COMPLETION(xfer.active);

        /* unpack the return code */
        cnt = 1;
        if (OPAL_SUCCESS != (ret = opal_dss.unpack(&xfer.data, &rc, &cnt, OPAL_INT))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&xfer);
            goto CLEANUP;
        }

        OPAL_OUTPUT_VERBOSE((1, ompi_pubsub_base_framework.framework_output,
                             "%s pubsub:orte: lookup returned status %d",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), rc));

        if (ORTE_SUCCESS == rc) {
            /* the server was able to lookup the port - unpack the port name */
            cnt=1;
            if (OPAL_SUCCESS != (ret = opal_dss.unpack(&xfer.data, &port_name, &cnt, OPAL_STRING))) {
                ORTE_ERROR_LOG(ret);
                OBJ_DESTRUCT(&xfer);
                goto CLEANUP;
            }

            OPAL_OUTPUT_VERBOSE((1, ompi_pubsub_base_framework.framework_output,
                                 "%s pubsub:orte: lookup returned port %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 (NULL == port_name) ? "NULL" : port_name));

            if (NULL != port_name) {
                /* got an answer - return it */
                OBJ_DESTRUCT(&xfer);
                return port_name;
            }
        }

        /* if we didn't get a port_name, then continue */
        OBJ_DESTRUCT(&xfer);
    }

    /* only get here if we tried both options and failed - since the
     * buffer will already have been cleaned up, just return
     */
 CLEANUP:
    return NULL;
}
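
The lookup request above packs a command and a service name; the matching publish request presumably adds the port value. A hedged sketch of that buffer layout (the publish command name and field order are assumptions inferred from this lookup code, not taken from the file):

/* Hypothetical publish-side layout - mirrors the lookup packing above. */
opal_buffer_t *buf = OBJ_NEW(opal_buffer_t);
orte_data_server_cmd_t cmd = ORTE_DATA_SERVER_PUBLISH;  /* assumed command */
opal_dss.pack(buf, &cmd, 1, ORTE_DATA_SERVER_CMD);
opal_dss.pack(buf, &service_name, 1, OPAL_STRING);
opal_dss.pack(buf, &port_name, 1, OPAL_STRING);         /* the published value */
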
Code example #5
File: regex.c  Project: JulianKunkel/siox-gpfs-ompi
int orte_regex_create(char *nodelist, char **regexp)
{
    char *node;
    char prefix[ORTE_MAX_NODE_PREFIX];
    int i, j, len, startnum, nodenum, numdigits;
    bool found, fullname;
    char *suffix, *sfx;
    orte_regex_node_t *ndreg;
    orte_regex_range_t *range;
    opal_list_t nodeids;
    opal_list_item_t *item, *itm2;
    char **regexargs = NULL, *tmp, *tmp2;
    char *cptr;

    /* define the default */
    *regexp = NULL;

    cptr = strchr(nodelist, ',');
    if (NULL == cptr) {
        /* if there is only one node, don't bother */
        *regexp = strdup(nodelist);
        return ORTE_SUCCESS;
    }

    /* setup the list of results */
    OBJ_CONSTRUCT(&nodeids, opal_list_t);

    /* cycle thru the array of nodenames */
    node = nodelist;
    while (NULL != (cptr = strchr(node, ',')) || 0 < strlen(node)) {
        if (NULL != cptr) {
            *cptr = '\0';
        }
        /* determine this node's prefix by looking for first non-alpha char */
        fullname = false;
        len = strlen(node);
        startnum = -1;
        memset(prefix, 0, ORTE_MAX_NODE_PREFIX);
        numdigits = 0;
        for (i=0, j=0; i < len; i++) {
            if (!isalpha(node[i])) {
                /* found a non-alpha char */
                if (!isdigit(node[i])) {
                    /* if it is anything but a digit, we just use
                     * the entire name
                     */
                    fullname = true;
                    break;
                }
                /* count the size of the numeric field - but don't
                 * add the digits to the prefix
                 */
                numdigits++;
                if (startnum < 0) {
                    /* okay, this defines end of the prefix */
                    startnum = i;
                }
                continue;
            }
            if (startnum < 0) {
                prefix[j++] = node[i];
            }
        }
        if (fullname || startnum < 0) {
            /* can't compress this name - just add it to the list */
            ndreg = OBJ_NEW(orte_regex_node_t);
            ndreg->prefix = strdup(node);
            opal_list_append(&nodeids, &ndreg->super);
            /* move to the next posn */
            if (NULL == cptr) {
                break;
            }
            node = cptr + 1;
            continue;
        }
        /* convert the digits and get any suffix */
        nodenum = strtol(&node[startnum], &sfx, 10);
        if (NULL != sfx) {
            suffix = strdup(sfx);
        } else {
            suffix = NULL;
        }
        /* is this nodeid already on our list? */
        found = false;
        for (item = opal_list_get_first(&nodeids);
             !found && item != opal_list_get_end(&nodeids);
             item = opal_list_get_next(item)) {
            ndreg = (orte_regex_node_t*)item;
            if (0 < strlen(prefix) && NULL == ndreg->prefix) {
                continue;
            }
            if (0 == strlen(prefix) && NULL != ndreg->prefix) {
                continue;
            }
            if (0 < strlen(prefix) && NULL != ndreg->prefix
                && 0 != strcmp(prefix, ndreg->prefix)) {
                continue;
            }
            if (NULL == suffix && NULL != ndreg->suffix) {
                continue;
            }
            if (NULL != suffix && NULL == ndreg->suffix) {
                continue;
            }
            if (NULL != suffix && NULL != ndreg->suffix &&
                0 != strcmp(suffix, ndreg->suffix)) {
                continue;
            }
            if (numdigits != ndreg->num_digits) {
                continue;
            }
            /* found a match - flag it */
            found = true;
            /* get the last range on this nodeid - we do this
             * to preserve order
             */
            range = (orte_regex_range_t*)opal_list_get_last(&ndreg->ranges);
            if (NULL == range) {
                /* first range for this nodeid */
                range = OBJ_NEW(orte_regex_range_t);
                range->start = nodenum;
                range->cnt = 1;
                opal_list_append(&ndreg->ranges, &range->super);
                break;
            }
            /* see if the node number is out of sequence */
            if (nodenum != (range->start + range->cnt)) {
                /* start a new range */
                range = OBJ_NEW(orte_regex_range_t);
                range->start = nodenum;
                range->cnt = 1;
                opal_list_append(&ndreg->ranges, &range->super);
                break;
            }
            /* everything matches - just increment the cnt */
            range->cnt++;
            break;
        }
        if (!found) {
            /* need to add it */
            ndreg = OBJ_NEW(orte_regex_node_t);
            if (0 < strlen(prefix)) {
                ndreg->prefix = strdup(prefix);
            }
            if (NULL != suffix) {
                ndreg->suffix = strdup(suffix);
            }
            ndreg->num_digits = numdigits;
            opal_list_append(&nodeids, &ndreg->super);
            /* record the first range for this nodeid - we took
             * care of names we can't compress above
             */
            range = OBJ_NEW(orte_regex_range_t);
            range->start = nodenum;
            range->cnt = 1;
            opal_list_append(&ndreg->ranges, &range->super);
        }
        if (NULL != suffix) {
            free(suffix);
        }
        /* move to the next posn */
        if (NULL == cptr) {
            break;
        }
        node = cptr + 1;
    }

    /* begin constructing the regular expression */
    while (NULL != (item = opal_list_remove_first(&nodeids))) {
        ndreg = (orte_regex_node_t*)item;
        
        /* if no ranges, then just add the name */
        if (0 == opal_list_get_size(&ndreg->ranges)) {
            if (NULL != ndreg->prefix) {
                /* solitary node */
                asprintf(&tmp, "%s", ndreg->prefix);
                opal_argv_append_nosize(&regexargs, tmp);
                free(tmp);
            }
            OBJ_RELEASE(ndreg);
            continue;
        }
        /* if there is only one range, and it has only one node in it,
         * then we don't want to use bracket notation - so treat that
         * case separately
         */
        if (1 == opal_list_get_size(&ndreg->ranges)) {
            /* must be at least one */
            range = (orte_regex_range_t*)opal_list_get_first(&ndreg->ranges);
            /* if there is only one node in the range, then
             * just add its name
             */
            if (1 == range->cnt) {
                if (NULL != ndreg->suffix) {
                    if (NULL != ndreg->prefix) {
                        asprintf(&tmp, "%s%d%s", ndreg->prefix, range->start, ndreg->suffix);
                    } else {
                        asprintf(&tmp, "%d%s", range->start, ndreg->suffix);
                    }
                } else {
                    if (NULL != ndreg->prefix) {
                        asprintf(&tmp, "%s%d", ndreg->prefix, range->start);
                    } else {
                        asprintf(&tmp, "%d", range->start);
                    }
                }
                opal_argv_append_nosize(&regexargs, tmp);
                free(tmp);
                OBJ_RELEASE(ndreg);
                continue;
            }
        }
        /* start the regex for this nodeid with the prefix */
        if (NULL != ndreg->prefix) {
            asprintf(&tmp, "%s[%d:", ndreg->prefix, ndreg->num_digits);
        } else {
            asprintf(&tmp, "[%d:", ndreg->num_digits);
        }
        /* add the ranges */
        while (NULL != (itm2 = opal_list_remove_first(&ndreg->ranges))) {
            range = (orte_regex_range_t*)itm2;
            if (1 == range->cnt) {
                asprintf(&tmp2, "%s%d,", tmp, range->start);
            } else {
                asprintf(&tmp2, "%s%d-%d,", tmp, range->start, range->start + range->cnt - 1);
            }
            free(tmp);
            tmp = tmp2;
            OBJ_RELEASE(range);
        }
        /* replace the final comma */
        tmp[strlen(tmp)-1] = ']';
        if (NULL != ndreg->suffix) {
            /* add in the suffix, if provided */
            asprintf(&tmp2, "%s%s", tmp, ndreg->suffix);
            free(tmp);
            tmp = tmp2;
        }
        opal_argv_append_nosize(&regexargs, tmp);
        free(tmp);
        OBJ_RELEASE(ndreg);
    }
    
    /* assemble final result */
    *regexp = opal_argv_join(regexargs, ',');
    /* cleanup */
    opal_argv_free(regexargs);

    OBJ_DESTRUCT(&nodeids);


    return ORTE_SUCCESS;
}
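
Usage sketch derived from the code above: a comma-separated list with a common prefix and a fixed-width numeric field compresses into the prefix[numdigits:ranges] notation. Note the function writes '\0' into its input, so pass a mutable buffer:

char nodes[] = "node001,node002,node003,node007";  /* mutable copy */
char *regex = NULL;
if (ORTE_SUCCESS == orte_regex_create(nodes, &regex)) {
    /* expected result: "node[3:1-3,7]" */
    free(regex);
}
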
Code example #6
/* This async event thread handles all async events from
 * all BTLs/devices in the openib component
 */
void* btl_openib_async_thread(void * async)
{
    int rc;
    int i;
    struct mca_btl_openib_async_poll devices_poll;
    opal_list_t ignore_qp_err_list;

    OBJ_CONSTRUCT(&ignore_qp_err_list, opal_list_t);

    if (OMPI_SUCCESS != btl_openib_async_poll_init(&devices_poll)) {
        BTL_ERROR(("Fatal error, stoping asynch event thread"));
        pthread_exit(&return_status);
    }

    while(1) {
        rc = poll(devices_poll.async_pollfd, devices_poll.active_poll_size, -1);
        if (rc < 0) {
            if (errno != EINTR) {
                BTL_ERROR(("Poll failed.  Fatal error, stoping asynch event thread"));
                pthread_exit(&return_status);
            } else {
                /* EINTR - we were interrupted */
                continue;
            }
        }
        for(i = 0; i < devices_poll.active_poll_size; i++) {
            switch (devices_poll.async_pollfd[i].revents) {
                case 0:
                    /* no events */
                    break;
                case POLLIN:
#if defined(__SVR4) && defined(__sun)
                /*
                 * Need workaround for Solaris IB user verbs since
                 * "Poll on IB async fd returns POLLRDNORM revent even though it is masked out"
                 */
                case POLLIN | POLLRDNORM:
#endif
                    /* Processing our event */
                    if (0 == i) {
                        /* fd 0 is used for communication with the main thread */
                        if (OMPI_SUCCESS != btl_openib_async_commandh(&devices_poll,
								      &ignore_qp_err_list)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    } else {
                        /* We got a device event */
                        if (btl_openib_async_deviceh(&devices_poll, i,
						     &ignore_qp_err_list)) {
                            free(devices_poll.async_pollfd);
                            BTL_ERROR(("Failed to process async thread process.  "
                                        "Fatal error, stoping asynch event thread"));
                            pthread_exit(&return_status);
                        }
                    }
                    break;
                default:
                    /* Got an event other than POLLIN -
                     * this case should never happen */
                    BTL_ERROR(("Got unexpected event %d.  "
                               "Fatal error, stopping async event thread",
                               devices_poll.async_pollfd[i].revents));
                    free(devices_poll.async_pollfd);
                    pthread_exit(&return_status);
            }
        }
    }
    return PTHREAD_CANCELED;
}
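
A sketch of how such a thread is typically launched; the command pipe polled as fd 0 above is how the main thread later signals it (everything beyond btl_openib_async_thread is an assumption):

/* Sketch only - launch the async event thread. Commands arrive on the
 * fd-0 channel handled by the btl_openib_async_commandh branch above. */
pthread_t async_thread;
if (0 != pthread_create(&async_thread, NULL, btl_openib_async_thread, NULL)) {
    BTL_ERROR(("Failed to create async event thread"));
}
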
Code example #7
static int rank_by(orte_job_t *jdata,
                   orte_app_context_t *app,
                   opal_list_t *nodes,
                   hwloc_obj_type_t target,
                   unsigned cache_level)
{
    hwloc_obj_t obj;
    int num_objs, i, j, rc;
    orte_vpid_t num_ranked=0;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_vpid_t vpid;
    int cnt;
    opal_pointer_array_t objs;
    bool all_done;
    opal_list_item_t *item;
    hwloc_obj_t locale;

    if (ORTE_RANKING_SPAN & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_span(jdata, app, nodes, target, cache_level);
    } else if (ORTE_RANKING_FILL & ORTE_GET_RANKING_DIRECTIVE(jdata->map->ranking)) {
        return rank_fill(jdata, app, nodes, target, cache_level);
    }

    /* if ranking is not spanned or filled, then we
     * default to assign ranks sequentially across
     * target objects within a node until that node
     * is fully ranked, and then move on to the next
     * node
     *
     *        Node 0                Node 1
     *    Obj 0     Obj 1       Obj 0     Obj 1
     *     0 2       1 3         8 10      9 11
     *     4 6       5 7        12 14     13 15
     */

    /* setup the pointer array */
    OBJ_CONSTRUCT(&objs, opal_pointer_array_t);
    opal_pointer_array_init(&objs, 2, INT_MAX, 2);

    vpid = jdata->num_procs;
    cnt = 0;
    for (item = opal_list_get_first(nodes);
         item != opal_list_get_end(nodes);
         item = opal_list_get_next(item)) {
        node = (orte_node_t*)item;
        /* get the number of objects - only consider those we can actually use */
        num_objs = opal_hwloc_base_get_nbobjs_by_type(node->topology, target,
                                                      cache_level, OPAL_HWLOC_AVAILABLE);
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:rank_by: found %d objects on node %s with %d procs",
                            num_objs, node->name, (int)node->num_procs);
        if (0 == num_objs) {
            OBJ_DESTRUCT(&objs);
            return ORTE_ERR_NOT_SUPPORTED;
        }
        /* collect all the objects */
        for (i=0; i < num_objs; i++) {
            obj = opal_hwloc_base_get_obj_by_type(node->topology, target,
                                                  cache_level, i, OPAL_HWLOC_AVAILABLE);
            opal_pointer_array_set_item(&objs, i, obj);
        }

        /* cycle across the objects, assigning a proc to each one,
         * until all procs have been assigned - unfortunately, since
         * more than this job may be mapped onto a node, the number
         * of procs on the node can't be used to tell us when we
         * are done. Instead, we have to just keep going until all
         * procs are ranked - which means we have to make one extra
         * pass thru the loop
         *
         * Perhaps someday someone will come up with a more efficient
         * algorithm, but this works for now.
         */
        all_done = false;
        while (!all_done && cnt < app->num_procs) {
            all_done = true;
            /* cycle across the objects */
            for (i=0; i < num_objs && cnt < app->num_procs; i++) {
                obj = (hwloc_obj_t)opal_pointer_array_get_item(&objs, i);

                /* find the next proc on this object */
                for (j=0; j < node->procs->size && cnt < app->num_procs; j++) {
                    if (NULL == (proc = (orte_proc_t*)opal_pointer_array_get_item(node->procs, j))) {
                        continue;
                    }
                    /* ignore procs from other jobs */
                    if (proc->name.jobid != jdata->jobid) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by skipping proc %s - from another job, num_ranked %d",
                                            ORTE_NAME_PRINT(&proc->name), num_ranked);
                        continue;
                    }
                    /* ignore procs that are already ranked */
                    if (ORTE_VPID_INVALID != proc->name.vpid) {
                        continue;
                    }
                    /* ignore procs from other apps */
                    if (proc->app_idx != app->idx) {
                        continue;
                    }
                    if (!orte_get_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, (void**)&locale, OPAL_PTR)) {
                        continue;
                    }
                    /* ignore procs on other objects */
                    if (!hwloc_bitmap_intersects(obj->cpuset, locale->cpuset)) {
                        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                            "mca:rmaps:rank_by: proc at position %d is not on object %d",
                                            j, i);
                        continue;
                    }
                    proc->name.vpid = vpid++;
                    if (0 == cnt) {
                        app->first_rank = proc->name.vpid;
                    }
                    cnt++;
                    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                                        "mca:rmaps:rank_by: assigned rank %s", ORTE_VPID_PRINT(proc->name.vpid));
                    /* insert the proc into the jdata array - no harm if already there */
                    if (ORTE_SUCCESS != (rc = opal_pointer_array_set_item(jdata->procs, proc->name.vpid, proc))) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&objs);
                        return rc;
                    }
                    /* flag that one was mapped */
                    all_done = false;
                    /* track where the highest vpid landed - this is our
                     * new bookmark
                     */
                    jdata->bookmark = node;
                    /* move to next object */
                    break;
                }
            }
        }
    }

    /* cleanup */
    OBJ_DESTRUCT(&objs);

    return ORTE_SUCCESS;
}
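
The default ordering documented in the comment inside rank_by (ranks cycle across objects within a node, then move to the next node) can be reproduced with a standalone loop. Illustration only, not OpenMPI code:

#include <stdio.h>

int main(void)
{
    int rank = 0;
    for (int node = 0; node < 2; node++) {          /* two nodes */
        for (int pass = 0; pass < 4; pass++) {      /* four procs per object */
            for (int obj = 0; obj < 2; obj++) {     /* two objects per node */
                printf("node %d obj %d -> rank %d\n", node, obj, rank++);
            }
        }
    }
    return 0;   /* prints the 0 2 4 6 / 1 3 5 7 pattern from the comment */
}
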
Code example #8
File: proc.c  Project: IanYXXL/A1
int
ompi_proc_pack(ompi_proc_t **proclist, int proclistsize,
               bool full_info,
               opal_buffer_t* buf)
{
    int i, rc;
    
    OPAL_THREAD_LOCK(&ompi_proc_lock);
    
    /* cycle through the provided array, packing the OMPI level
     * data for each proc. This data may or may not be included
     * in any subsequent modex operation, so we include it here
     * to ensure completion of a connect/accept handshake. See
     * the ompi/mca/dpm framework for an example of where and how
     * this info is used.
     *
     * Eventually, we will review the procedures that call this
     * function to see if duplication of communication can be
     * reduced. For now, just go ahead and pack the info so it
     * can be sent.
     */
    for (i=0; i<proclistsize; i++) {
        rc = opal_dss.pack(buf, &(proclist[i]->proc_name), 1, OMPI_NAME);
        if(rc != OPAL_SUCCESS) {
            OMPI_ERROR_LOG(rc);
            OPAL_THREAD_UNLOCK(&ompi_proc_lock);
            return rc;
        }
        if (full_info) {
            int32_t num_entries;
            opal_value_t *kv;
            opal_list_t data;

            /* fetch all global info we know about the peer - while
             * the remote procs may already know some of it, we cannot
             * be certain they do. So we must include a full dump of
             * everything we know about this proc, excluding INTERNAL
             * data that each process computes about its peers
             */
            OBJ_CONSTRUCT(&data, opal_list_t);
            rc = opal_db.fetch_multiple((opal_identifier_t*)&proclist[i]->proc_name,
                                        OPAL_SCOPE_GLOBAL, NULL, &data);
            if (OPAL_SUCCESS != rc) {
                OMPI_ERROR_LOG(rc);
                num_entries = 0;
            } else {
                /* count the number of entries we will send */
                num_entries = opal_list_get_size(&data);
            }

            /* put the number of entries into the buffer */
            rc = opal_dss.pack(buf, &num_entries, 1, OPAL_INT32);
            if (OPAL_SUCCESS != rc) {
                OMPI_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&ompi_proc_lock);
                return rc;
            }
    
            /* if there are entries, store them */
            while (NULL != (kv = (opal_value_t*)opal_list_remove_first(&data))) {
                if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &kv, 1, OPAL_VALUE))) {
                    OMPI_ERROR_LOG(rc);
                    OBJ_RELEASE(kv);
                    break;
                }
                OBJ_RELEASE(kv);
            }
            OBJ_DESTRUCT(&data);

        } else {
            rc = opal_dss.pack(buf, &(proclist[i]->proc_arch), 1, OPAL_UINT32);
            if(rc != OPAL_SUCCESS) {
                OMPI_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&ompi_proc_lock);
                return rc;
            }
            rc = opal_dss.pack(buf, &(proclist[i]->proc_hostname), 1, OPAL_STRING);
            if(rc != OPAL_SUCCESS) {
                OMPI_ERROR_LOG(rc);
                OPAL_THREAD_UNLOCK(&ompi_proc_lock);
                return rc;
            }
        }
    }
    OPAL_THREAD_UNLOCK(&ompi_proc_lock);
    return OMPI_SUCCESS;
}
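
The natural counterpart walks the buffer in the same order: the name, then (in full_info mode) an entry count followed by that many opal_value_t keys. A hedged sketch of the full_info unpack loop (variable names and the name type are illustrative; the real counterpart is ompi_proc_unpack in the same file):

/* Sketch only - mirrors the packing order used above for full_info. */
orte_process_name_t name;      /* assumed name type */
int32_t cnt = 1, num_entries, n;
opal_value_t *kv;

opal_dss.unpack(buf, &name, &cnt, OMPI_NAME);
cnt = 1;
opal_dss.unpack(buf, &num_entries, &cnt, OPAL_INT32);
for (n = 0; n < num_entries; n++) {
    cnt = 1;
    opal_dss.unpack(buf, &kv, &cnt, OPAL_VALUE);
    /* ... store kv for the peer, then OBJ_RELEASE(kv) when done ... */
}
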
Code example #9
/*
 * Scan down the list of successfully opened components and query each of
 * them (the opened list will be one or more components.  If the user
 * requested a specific component, it will be the only component in the
 * opened list).  Create and populate the available list of all
 * components who indicate that they want to be considered for selection.
 * Close all components who do not want to be considered for selection,
 * and destroy the opened list.
 *
 * Also find the basic component while we're doing all of this, and save
 * it in a global variable so that we can find it easily later (e.g.,
 * during scope selection).
 */
int mca_coll_base_find_available(bool enable_progress_threads,
                                 bool enable_mpi_threads)
{
  bool found = false;
  mca_base_component_priority_list_item_t *entry;
  opal_list_item_t *p;
  const mca_base_component_t *component;

  /* Initialize the list */

  OBJ_CONSTRUCT(&mca_coll_base_components_available, opal_list_t);
  mca_coll_base_components_available_valid = true;

  /* The list of components that we should check has already been
     established in mca_coll_base_open. */
  
  for (found = false, 
         p = opal_list_remove_first(&mca_coll_base_components_opened);
       p != NULL;
       p = opal_list_remove_first(&mca_coll_base_components_opened)) {
    component = ((mca_base_component_list_item_t *) p)->cli_component;

    /* Call a subroutine to do the work, because the component may
       represent different versions of the coll MCA. */
    
    entry = OBJ_NEW(mca_base_component_priority_list_item_t);
    entry->super.cli_component = component;
    entry->cpli_priority = 0;
    if (OMPI_SUCCESS == init_query(component, entry, 
                                   enable_progress_threads,
                                   enable_mpi_threads)) {
        opal_list_append(&mca_coll_base_components_available, 
                         (opal_list_item_t *) entry);
        found = true;
    } else {
      
      /* If the component doesn't want to run, then close it.  It's
         already had its close() method invoked; now close it out of
         the DSO repository (if it's there). */
      
      mca_base_component_repository_release(component);
      OBJ_RELEASE(entry);
    }

    /* Free the entry from the "opened" list */

    OBJ_RELEASE(p);
  }

  /* The opened list is now no longer useful and we can free it */
    
  OBJ_DESTRUCT(&mca_coll_base_components_opened);
  mca_coll_base_components_opened_valid = false;

  /* If we have no collective components available, it's an error.
     Thanks for playing! */
    
  if (!found) {
    /* Need to free all items in the list */
    OBJ_DESTRUCT(&mca_coll_base_components_available);
    mca_coll_base_components_available_valid = false;
    opal_output_verbose(10, mca_coll_base_output,
                       "coll:find_available: no coll components available!");
    orte_show_help("help-mca-base", "find-available:none-found", true,
                   "coll");
    return OMPI_ERROR;
  }

  /* All done */

  return OMPI_SUCCESS;
}
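
init_query is a static helper in the same file; a hedged sketch of its core logic (version handling is omitted, and the interface field name reflects the coll component struct of that era):

/* Sketch only - ask the component whether it wants to run in this
 * environment; keep it on the available list only if it says yes. */
static int init_query(const mca_base_component_t *component,
                      mca_base_component_priority_list_item_t *entry,
                      bool enable_progress_threads,
                      bool enable_mpi_threads)
{
    const mca_coll_base_component_2_0_0_t *coll =
        (const mca_coll_base_component_2_0_0_t *) component;
    (void) entry;   /* priority is assigned later, at communicator query time */
    if (NULL == coll->collm_init_query) {
        return OMPI_ERROR;
    }
    return coll->collm_init_query(enable_progress_threads,
                                  enable_mpi_threads);
}
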
Code example #10
int orte_util_filter_dash_host_nodes(opal_list_t *nodes,
                                     char** host_argv,
                                     bool remove)
{
    opal_list_item_t* item;
    bool found;
    opal_list_item_t *next;
    orte_std_cntr_t i, j, len_mapped_node=0;
    int rc;
    char **mapped_nodes = NULL;
    orte_node_t *node, *hnp_node;
    int num_empty=0;
    opal_list_t keep;
    bool want_all_empty=false;
    
    /* if the incoming node list is empty, then there
     * is nothing to filter!
     */
    if (opal_list_is_empty(nodes)) {
        return ORTE_SUCCESS;
    }

    if (ORTE_SUCCESS != (rc = parse_dash_host(&mapped_nodes, host_argv))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* Did we find anything? If not, then do nothing */
    if (NULL == mapped_nodes) {
        return ORTE_SUCCESS;
    }
    
    /* NOTE: The following logic is based on knowing that
     * any node can only be included on the incoming
     * nodes list ONCE.
     */

    /* get the hnp node's info */
    hnp_node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        
    len_mapped_node = opal_argv_count(mapped_nodes);
    /* setup a working list so we can put the final list
     * of nodes in order. This way, if the user specifies a
     * set of nodes, we will use them in the order in which
     * they were specified. Note that empty node requests
     * will always be appended to the end
     */
    OBJ_CONSTRUCT(&keep, opal_list_t);
    
    for (i = 0; i < len_mapped_node; ++i) {
        /* check if we are supposed to add some number of empty
         * nodes here
         */
        if ('*' == mapped_nodes[i][0]) {
            /* if there is a number after the '*', then we are
             * to insert a specific # of nodes
             */
            if ('\0' == mapped_nodes[i][1]) {
                /* take all empty nodes from the list */
                num_empty = INT_MAX;
                want_all_empty = true;
            } else {
                /* extract number of nodes to take */
                num_empty = strtol(&mapped_nodes[i][1], NULL, 10);
            }
            /* search for empty nodes and take them */
            item = opal_list_get_first(nodes);
            while (0 < num_empty && item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* see if this node is empty */
                if (0 == node->slots_inuse) {
                    /* check to see if it is specified later */
                    for (j=i+1; j < len_mapped_node; j++) {
                        if (0 == strcmp(mapped_nodes[j], node->name)) {
                            /* specified later - skip this one */
                            goto skipnode;
                        }
                    }
                    if (remove) {
                        /* remove item from list */
                        opal_list_remove_item(nodes, item);
                        /* xfer to keep list */
                        opal_list_append(&keep, item);
                    } else {
                        /* mark the node as found */
                        node->mapped = true;
                    }
                    --num_empty;
                }
            skipnode:
                item = next;
            }
        } else {
            /* we are looking for a specific node on the list
             * we have a match if one of two conditions is met:
             * 1. the node_name and mapped_nodes directly match
             * 2. the node_name is the local system name AND
             *    either the mapped_node is "localhost" OR it
             *    is a local interface as found by opal_ifislocal
             */
            item = opal_list_get_first(nodes);
            while (item != opal_list_get_end(nodes)) {
                next = opal_list_get_next(item);  /* save this position */
                node = (orte_node_t*)item;
                /* search -host list to see if this one is found */
                found = false;
                if (0 == strcmp(node->name, mapped_nodes[i]) ||
                    (0 == strcmp(node->name, hnp_node->name) &&
                    (0 == strcasecmp(mapped_nodes[i], "localhost") || opal_ifislocal(mapped_nodes[i])))) {
                    if (remove) {
                        /* remove item from list */
                        opal_list_remove_item(nodes, item);
                        /* xfer to keep list */
                        opal_list_append(&keep, item);
                    } else {
                        /* mark the node as found */
                        node->mapped = true;
                    }
                    break;
                }
                item = next;
            }
        }
        /* done with the mapped entry */
        free(mapped_nodes[i]);
        mapped_nodes[i] = NULL;
    }

    /* was something specified that was -not- found? */
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            orte_show_help("help-dash-host.txt", "not-all-mapped-alloc",
                           true, mapped_nodes[i]);
            rc = ORTE_ERR_SILENT;
            goto cleanup;
        }
    }
    
    if (!remove) {
        /* all done */
        rc = ORTE_SUCCESS;
        goto cleanup;
    }

    /* clear the rest of the nodes list */
    while (NULL != (item = opal_list_remove_first(nodes))) {
        OBJ_RELEASE(item);
    }
    
    /* the nodes list has been cleared - rebuild it in order */
    while (NULL != (item = opal_list_remove_first(&keep))) {
        opal_list_append(nodes, item);
    }
    
    /* did they ask for more than we could provide? */
    if (!want_all_empty && 0 < num_empty) {
        orte_show_help("help-dash-host.txt", "dash-host:not-enough-empty",
                       true, num_empty);
        rc = ORTE_ERR_SILENT;
        goto cleanup;
    }
    
    rc = ORTE_SUCCESS;
    /* done filtering existing list */
    
cleanup:
    for (i=0; i < len_mapped_node; i++) {
        if (NULL != mapped_nodes[i]) {
            free(mapped_nodes[i]);
            mapped_nodes[i] = NULL;
        }
    }
    if (NULL != mapped_nodes) {
        free(mapped_nodes);
    }
    
    return rc;
}
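
Usage sketch of the "*N" syntax handled above: keep one named node plus up to two currently-empty nodes, pruning everything else from the list (population of the list from the allocation is omitted):

/* Sketch only - filter the allocation down to nodeA plus two empty nodes. */
char *dash_host[] = { "nodeA", "*2", NULL };
opal_list_t nodes;

OBJ_CONSTRUCT(&nodes, opal_list_t);
/* ... populate nodes from the current allocation ... */
if (ORTE_SUCCESS != orte_util_filter_dash_host_nodes(&nodes, dash_host, true)) {
    /* a -host entry didn't match, or fewer than 2 empty nodes existed */
}
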
Code example #11
int ompi_mpi_init(int argc, char **argv, int requested, int *provided)
{
    int ret;
    ompi_proc_t** procs;
    size_t nprocs;
    char *error = NULL;
    bool timing = false;
    int param, value;
    struct timeval ompistart, ompistop;
    char *event_val = NULL;
    bool orte_setup = false;
    orte_grpcomm_collective_t *coll;
    char *cmd=NULL, *av=NULL;

    /* bitflag of the thread level support provided. To be used
     * for the modex in order to work in heterogeneous environments. */
    uint8_t threadlevel_bf; 

    /* Indicate that we have *started* MPI_INIT*.  MPI_FINALIZE has
       something sorta similar in a static local variable in
       ompi_mpi_finalize(). */
    ompi_mpi_init_started = true;

    /* Setup enough to check get/set MCA params */

    if (OPAL_SUCCESS != (ret = opal_init_util(&argc, &argv))) {
        error = "ompi_mpi_init: opal_init_util failed";
        goto error;
    }

    if (OPAL_SUCCESS != (ret = opal_arch_set_fortran_logical_size(sizeof(ompi_fortran_logical_t)))) {
        error = "ompi_mpi_init: opal_arch_set_fortran_logical_size failed";
        goto error;
    }

    /* _After_ opal_init_util() but _before_ orte_init(), we need to
       set an MCA param that tells libevent that it's ok to use any
       mechanism in libevent that is available on this platform (e.g.,
       epoll and friends).  Per opal/event/event.s, we default to
       select/poll -- but we know that MPI processes won't be using
       pty's with the event engine, so it's ok to relax this
       constraint and let any fd-monitoring mechanism be used. */
    ret = mca_base_param_reg_string_name("opal", "event_include",
                                         "Internal orted MCA param: tell opal_init() to use a specific mechanism in libevent",
                                         false, false, "all", &event_val);
    if (ret >= 0) {
        /* We have to explicitly "set" the MCA param value here
           because libevent initialization will re-register the MCA
           param and therefore override the default. Setting the value
           here puts the desired value ("all") in different storage
           that is not overwritten if/when the MCA param is
           re-registered. This is unless the user has specified a different
           value for this MCA parameter. Make sure we check to see if the
           default is specified before forcing "all" in case that is not what
           the user desires. Note that we do *NOT* set this value as an
           environment variable, just so that it won't be inherited by
           any spawned processes and potentially cause unintended
           side-effects with launching ORTE tools... */
        if (0 == strcmp("all", event_val)) {
            mca_base_param_set_string(ret, "all");
        }
    }

    if( NULL != event_val ) {
        free(event_val);
        event_val = NULL;
    }

    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        gettimeofday(&ompistart, NULL);
    }
    
    /* if we were not externally started, then we need to setup
     * some envars so the MPI_INFO_ENV can get the cmd name
     * and argv (but only if the user supplied a non-NULL argv!), and
     * the requested thread level
     */
    if (NULL == getenv("OMPI_COMMAND") && NULL != argv && NULL != argv[0]) {
        asprintf(&cmd, "OMPI_COMMAND=%s", argv[0]);
        putenv(cmd);
    }
    if (NULL == getenv("OMPI_ARGV") && 1 < argc) {
        char *tmp;
        tmp = opal_argv_join(&argv[1], ' ');
        asprintf(&av, "OMPI_ARGV=%s", tmp);
        free(tmp);
        putenv(av);
    }

    /* Setup ORTE - note that we are an MPI process  */
    if (ORTE_SUCCESS != (ret = orte_init(NULL, NULL, ORTE_PROC_MPI))) {
        error = "ompi_mpi_init: orte_init failed";
        goto error;
    }
    orte_setup = true;
    
    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init [%ld]: time from start to completion of orte_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OPAL_HAVE_HWLOC
    /* if hwloc is available but didn't get setup for some
     * reason, do so now
     */
    if (NULL == opal_hwloc_topology) {
        if (OPAL_SUCCESS != (ret = opal_hwloc_base_get_topology())) {
            error = "Topology init";
            goto error;
        }
    }
#endif

    /* Register errhandler callback with orte errmgr */
    if (NULL != orte_errmgr.set_fault_callback) {
        orte_errmgr.set_fault_callback(ompi_errhandler_runtime_callback);
    }

    /* Figure out the final MPI thread levels.  If we were not
       compiled for support for MPI threads, then don't allow
       MPI_THREAD_MULTIPLE.  Set this stuff up here early in the
       process so that other components can make decisions based on
       this value. */
    /**
     * These values are monotonic; MPI_THREAD_SINGLE < MPI_THREAD_FUNNELED
     *                             < MPI_THREAD_SERIALIZED < MPI_THREAD_MULTIPLE.
     * If possible, the call will return provided = required. Failing this,
     * the call will return the least supported level such that
     * provided > required. Finally, if the user requirement cannot be
     * satisfied, then the call will return in provided the highest
     * supported level.
     */
    ompi_mpi_thread_requested = requested;

    if (OMPI_ENABLE_THREAD_MULTIPLE == 1) {
        ompi_mpi_thread_provided = *provided = requested;
        ompi_mpi_main_thread = opal_thread_get_self();
    } else {
        if (MPI_THREAD_MULTIPLE == requested) {
            ompi_mpi_thread_provided = *provided = MPI_THREAD_SERIALIZED;
        } else {
            ompi_mpi_thread_provided = *provided = requested;
        }
        ompi_mpi_main_thread = (OPAL_ENABLE_MULTI_THREADS ? opal_thread_get_self() : NULL);
    }

    ompi_mpi_thread_multiple = (ompi_mpi_thread_provided == 
                                MPI_THREAD_MULTIPLE);

    /* determine the bitflag belonging to the threadlevel_support provided */
    memset ( &threadlevel_bf, 0, sizeof(uint8_t));
    OMPI_THREADLEVEL_SET_BITFLAG ( ompi_mpi_thread_provided, threadlevel_bf );

    /* add this bitflag to the modex */
    if ( OMPI_SUCCESS != (ret = ompi_modex_send_string("MPI_THREAD_LEVEL", &threadlevel_bf, sizeof(uint8_t)))) {
        error = "ompi_mpi_init: modex send thread level";
        goto error;
    }

    /* Once we've joined the RTE, see if any MCA parameters were
       passed to the MPI level */

    if (OMPI_SUCCESS != (ret = ompi_mpi_register_params())) {
        error = "mca_mpi_register_params() failed";
        goto error;
    }

    /* initialize datatypes. This step should be done early as it will
     * create the local convertor and local arch used in the proc
     * init.
     */
    if (OMPI_SUCCESS != (ret = ompi_datatype_init())) {
        error = "ompi_datatype_init() failed";
        goto error;
    }

    /* Initialize OMPI procs */
    if (OMPI_SUCCESS != (ret = ompi_proc_init())) {
        error = "mca_proc_init() failed";
        goto error;
    }

    /* Initialize the op framework. This has to be done *after*
       ddt_init, but before mca_coll_base_open, since some collective
       modules (e.g., the hierarchical coll component) may need ops in
       their query function. */
    if (OMPI_SUCCESS != (ret = ompi_op_base_open())) {
        error = "ompi_op_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != 
        (ret = ompi_op_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                           OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_op_base_find_available() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_op_init())) {
        error = "ompi_op_init() failed";
        goto error;
    }

    /* Open up MPI-related MCA components */

    if (OMPI_SUCCESS != (ret = mca_allocator_base_open())) {
        error = "mca_allocator_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_rcache_base_open())) {
        error = "mca_rcache_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_mpool_base_open())) {
        error = "mca_mpool_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_pml_base_open())) {
        error = "mca_pml_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = mca_coll_base_open())) {
        error = "mca_coll_base_open() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_osc_base_open())) {
        error = "ompi_osc_base_open() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_open())) {
        error = "ompi_crcp_base_open() failed";
        goto error;
    }
#endif

    /* In order to reduce the common case for MPI apps (where they
       don't use MPI-2 IO or MPI-1 topology functions), the io and
       topo frameworks are initialized lazily, at the first use of
       relevant functions (e.g., MPI_FILE_*, MPI_CART_*, MPI_GRAPH_*),
       so they are not opened here. */

    /* Select which MPI components to use */

    if (OMPI_SUCCESS != 
        (ret = mca_mpool_base_init(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_mpool_base_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_pml_base_select(OMPI_ENABLE_PROGRESS_THREADS,
                                   OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_pml_base_select() failed";
        goto error;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from completion of orte_init to modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* exchange connection info - this function also acts as a barrier
     * as it will not return until the exchange is complete
     */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_modex;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.modex(coll))) {
        error = "orte_grpcomm_modex failed";
        goto error;
    }
    /* wait for modex to complete - this may be moved anywhere in mpi_init
     * so long as it occurs prior to calling a function that needs
     * the modex info!
     */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute modex %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

    /* select buffered send allocator component to be used */
    ret = mca_pml_base_bsend_init(OMPI_ENABLE_THREAD_MULTIPLE);
    if (OMPI_SUCCESS != ret) {
        error = "mca_pml_base_bsend_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "mca_coll_base_find_available() failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = ompi_osc_base_find_available(OMPI_ENABLE_PROGRESS_THREADS,
                                            OMPI_ENABLE_THREAD_MULTIPLE))) {
        error = "ompi_osc_base_find_available() failed";
        goto error;
    }

#if OPAL_ENABLE_FT_CR == 1
    if (OMPI_SUCCESS != (ret = ompi_crcp_base_select() ) ) {
        error = "ompi_crcp_base_select() failed";
        goto error;
    }
#endif

    /* io and topo components are not selected here -- see comment
       above about the io and topo frameworks being loaded lazily */

    /* Initialize each MPI handle subsystem */
    /* initialize requests */
    if (OMPI_SUCCESS != (ret = ompi_request_init())) {
        error = "ompi_request_init() failed";
        goto error;
    }

    if (OMPI_SUCCESS != (ret = ompi_message_init())) {
        error = "ompi_message_init() failed";
        goto error;
    }

    /* initialize info */
    if (OMPI_SUCCESS != (ret = ompi_info_init())) {
        error = "ompi_info_init() failed";
        goto error;
    }

    /* initialize error handlers */
    if (OMPI_SUCCESS != (ret = ompi_errhandler_init())) {
        error = "ompi_errhandler_init() failed";
        goto error;
    }

    /* initialize error codes */
    if (OMPI_SUCCESS != (ret = ompi_mpi_errcode_init())) {
        error = "ompi_mpi_errcode_init() failed";
        goto error;
    }
    
    /* initialize internal error codes */
    if (OMPI_SUCCESS != (ret = ompi_errcode_intern_init())) {
        error = "ompi_errcode_intern_init() failed";
        goto error;
    }
     
    /* initialize groups  */
    if (OMPI_SUCCESS != (ret = ompi_group_init())) {
        error = "ompi_group_init() failed";
        goto error;
    }

    /* initialize communicators */
    if (OMPI_SUCCESS != (ret = ompi_comm_init())) {
        error = "ompi_comm_init() failed";
        goto error;
    }

    /* initialize file handles */
    if (OMPI_SUCCESS != (ret = ompi_file_init())) {
        error = "ompi_file_init() failed";
        goto error;
    }

    /* initialize windows */
    if (OMPI_SUCCESS != (ret = ompi_win_init())) {
        error = "ompi_win_init() failed";
        goto error;
    }

    /* initialize attribute meta-data structure for comm/win/dtype */
    if (OMPI_SUCCESS != (ret = ompi_attr_init())) {
        error = "ompi_attr_init() failed";
        goto error;
    }

    /* identify the architectures of remote procs and setup
     * their datatype convertors, if required
     */
    if (OMPI_SUCCESS != (ret = ompi_proc_complete_init())) {
        error = "ompi_proc_complete_init failed";
        goto error;
    }

    /* If thread support was enabled, then setup OPAL to allow for
       them. */
    if ((OMPI_ENABLE_PROGRESS_THREADS == 1) ||
        (*provided != MPI_THREAD_SINGLE)) {
        opal_set_using_threads(true);
    }

    /* start PML/BTL's */
    ret = MCA_PML_CALL(enable(true));
    if (OMPI_SUCCESS != ret) {
        error = "PML control failed";
        goto error;
    }

    /* add all ompi_proc_t's to PML */
    if (NULL == (procs = ompi_proc_world(&nprocs))) {
        error = "ompi_proc_world() failed";
        goto error;
    }
    ret = MCA_PML_CALL(add_procs(procs, nprocs));
    free(procs);
    /* If we got "unreachable", then print a specific error message.
       Otherwise, if we got some other failure, fall through to print
       a generic message. */
    if (OMPI_ERR_UNREACH == ret) {
        orte_show_help("help-mpi-runtime",
                       "mpi_init:startup:pml-add-procs-fail", true);
        error = NULL;
        goto error;
    } else if (OMPI_SUCCESS != ret) {
        error = "PML add procs failed";
        goto error;
    }

    MCA_PML_CALL(add_comm(&ompi_mpi_comm_world.comm));
    MCA_PML_CALL(add_comm(&ompi_mpi_comm_self.comm));

    /*
     * Dump all MCA parameters if requested
     */
    if (ompi_mpi_show_mca_params) {
        ompi_show_all_mca_params(ompi_mpi_comm_world.comm.c_my_rank, 
                                 nprocs, 
                                 orte_process_info.nodename);
    }

    /* Do we need to wait for a debugger? */
    ompi_wait_for_debugger();
    
    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from modex to first barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }
    
    /* wait for everyone to reach this point */
    coll = OBJ_NEW(orte_grpcomm_collective_t);
    coll->id = orte_process_info.peer_init_barrier;
    if (ORTE_SUCCESS != (ret = orte_grpcomm.barrier(coll))) {
        error = "orte_grpcomm_barrier failed";
        goto error;
    }
    /* wait for barrier to complete */
    while (coll->active) {
        opal_progress();  /* block in progress pending events */
    }
    OBJ_RELEASE(coll);

    /* check for timing request - get stop time and report elapsed
       time if so, then start the clock again */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time to execute barrier %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
        gettimeofday(&ompistart, NULL);
    }

#if OMPI_ENABLE_PROGRESS_THREADS == 0
    /* Start setting up the event engine for MPI operations.  Don't
       block in the event library, so that communications don't take
       forever between procs in the dynamic code.  This will increase
       CPU utilization for the remainder of MPI_INIT when we are
       blocking on ORTE-level events, but may greatly reduce non-TCP
       latency. */
    opal_progress_set_event_flag(OPAL_EVLOOP_NONBLOCK);
#endif
    
    /* wire up the mpi interface, if requested.  Do this after the
       non-block switch for non-TCP performance.  Do before the
       polling change as anyone with a complex wire-up is going to be
       using the oob. */
    if (OMPI_SUCCESS != (ret = ompi_init_preconnect_mpi())) {
        error = "ompi_mpi_do_preconnect_all() failed";
        goto error;
    }

    /* Setup the publish/subscribe (PUBSUB) framework */
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_open())) {
        error = "ompi_pubsub_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_pubsub_base_select())) {
        error = "ompi_pubsub_base_select() failed";
        goto error;
    }
    
    /* Setup the dynamic process management (DPM) framework */
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_open())) {
        error = "ompi_dpm_base_open() failed";
        goto error;
    }
    if (OMPI_SUCCESS != (ret = ompi_dpm_base_select())) {
        error = "ompi_dpm_base_select() failed";
        goto error;
    }


    /* Determine the overall threadlevel support of all processes 
       in MPI_COMM_WORLD. This has to be done before calling 
       coll_base_comm_select, since some of the collective components
       e.g. hierarch, might create subcommunicators. The threadlevel
       requested by all processes is required in order to know
       which cid allocation algorithm can be used. */
    if (OMPI_SUCCESS !=
        (ret = ompi_comm_cid_init())) {
        error = "ompi_mpi_init: ompi_comm_cid_init failed";
        goto error;
    }

    /* Init coll for the comms. This has to be after dpm_base_select, 
       (since dpm.mark_dyncomm is not set in the communicator creation
       function else), but before dpm.dyncom_init, since this function
       might require collective for the CID allocation. */
    if (OMPI_SUCCESS !=
        (ret = mca_coll_base_comm_select(MPI_COMM_WORLD))) {
        error = "mca_coll_base_comm_select(MPI_COMM_WORLD) failed";
        goto error;
    }

    if (OMPI_SUCCESS != 
        (ret = mca_coll_base_comm_select(MPI_COMM_SELF))) {
        error = "mca_coll_base_comm_select(MPI_COMM_SELF) failed";
        goto error;
    }

    /* Check whether we have been spawned or not.  We introduce that
       at the very end, since we need collectives, datatypes, ptls
       etc. up and running here.... */
    if (OMPI_SUCCESS != (ret = ompi_dpm.dyn_init())) {
        error = "ompi_comm_dyn_init() failed";
        goto error;
    }

    /*
     * Startup the Checkpoint/Restart Mech.
     * Note: Always do this so tools don't hang when
     * in a non-checkpointable build
     */
    if (OMPI_SUCCESS != (ret = ompi_cr_init())) {
        error = "ompi_cr_init";
        goto error;
    }

    /* Undo OPAL calling opal_progress_event_users_increment() during 
       opal_init, to get better latency when not using TCP.  Do 
       this *after* dyn_init, as dyn init uses lots of ORTE 
       communication and we don't want to hinder the performance of 
       that code. */ 
    opal_progress_event_users_decrement(); 

    /* see if yield_when_idle was specified - if so, use it */
    param = mca_base_param_find("mpi", NULL, "yield_when_idle");
    mca_base_param_lookup_int(param, &value);
    if (value < 0) {
        /* if no info is provided, just default to conservative */
        opal_progress_set_yield_when_idle(true);
    } else {
        /* info was provided, so set idle accordingly */
        opal_progress_set_yield_when_idle(value == 0 ? false : true);
    }
    
    param = mca_base_param_find("mpi", NULL, "event_tick_rate");
    mca_base_param_lookup_int(param, &value);
    /* negative value means use default - just don't do anything */
    if (value >= 0) {
        opal_progress_set_event_poll_rate(value);
    }

    /* At this point, we are fully configured and in MPI mode.  Any
       communication calls here will work exactly like they would in
       the user's code.  Setup the connections between procs and warm
       them up with simple sends, if requested */

    if (OMPI_SUCCESS != (ret = ompi_mpiext_init())) {
        error = "ompi_mpiext_init";
        goto error;
    }

    /* Fall through */
 error:
    if (ret != OMPI_SUCCESS) {
        /* Only print a message if one was not already printed */
        if (NULL != error) {
            const char *err_msg = opal_strerror(ret);
            /* If ORTE was not setup yet, don't use orte_show_help */
            if (orte_setup) {
                orte_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            } else {
                opal_show_help("help-mpi-runtime",
                               "mpi_init:startup:internal-failure", true,
                               "MPI_INIT", "MPI_INIT", error, err_msg, ret);
            }
        }
        return ret;
    }

    /* Initialize the registered datarep list to be empty */
    OBJ_CONSTRUCT(&ompi_registered_datareps, opal_list_t);

    /* Initialize the arrays used to store the F90 types returned by the
     *  MPI_Type_create_f90_XXX functions.
     */
    OBJ_CONSTRUCT( &ompi_mpi_f90_integer_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_integer_hashtable, 16 /* why not? */);

    OBJ_CONSTRUCT( &ompi_mpi_f90_real_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_real_hashtable, FLT_MAX_10_EXP);

    OBJ_CONSTRUCT( &ompi_mpi_f90_complex_hashtable, opal_hash_table_t);
    opal_hash_table_init(&ompi_mpi_f90_complex_hashtable, FLT_MAX_10_EXP);

    /* All done.  Wasn't that simple? */

    ompi_mpi_initialized = true;

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing && 0 == ORTE_PROC_MY_NAME->vpid) {
        gettimeofday(&ompistop, NULL);
        opal_output(0, "ompi_mpi_init[%ld]: time from barrier to complete mpi_init %ld usec",
                    (long)ORTE_PROC_MY_NAME->vpid,
                    (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                               (ompistop.tv_usec - ompistart.tv_usec)));
    }

    return MPI_SUCCESS;
}
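
Note the error-handling idiom that structures the whole function above: each initialization step records a human-readable string and jumps to a single shared label, and the success path falls through that same label. A minimal stand-alone sketch of the idiom (step_one/step_two are invented placeholders, not OMPI functions):

#include <stdio.h>

static int step_one(void) { return 0; }   /* placeholder step */
static int step_two(void) { return 0; }   /* placeholder step */

static int init_like_ompi(void)
{
    int ret = 0;
    char *error = NULL;

    if (0 != (ret = step_one())) {
        error = "step_one() failed";
        goto error;
    }
    if (0 != (ret = step_two())) {
        error = "step_two() failed";
        goto error;
    }

    /* fall through - success and failure share the label */
 error:
    if (0 != ret && NULL != error) {
        fprintf(stderr, "init failed: %s (rc=%d)\n", error, ret);
        return ret;
    }
    return 0;
}
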
Code example #12
/* When working in this function, ALWAYS jump to "cleanup" if
 * you encounter an error so that orterun will be woken up and
 * the job can cleanly terminate
 */
static int plm_slurm_launch_job(orte_job_t *jdata)
{
    orte_app_context_t **apps;
    orte_node_t **nodes;
    orte_std_cntr_t n;
    orte_job_map_t *map;
    char *jobid_string = NULL;
    char *param;
    char **argv = NULL;
    int argc;
    int rc;
    char *tmp;
    char** env = NULL;
    char* var;
    char *nodelist_flat;
    char **nodelist_argv;
    char *name_string;
    char **custom_strings;
    int num_args, i;
    char *cur_prefix;
    struct timeval launchstart, launchstop;
    int proc_vpid_index;
    orte_jobid_t failed_job;
    bool failed_launch=true;
    bool using_regexp=false;

    if (NULL == jdata) {
        /* just launching debugger daemons */
        active_job = ORTE_JOBID_INVALID;
        goto launch_apps;
    }

    if (jdata->controls & ORTE_JOB_CONTROL_DEBUGGER_DAEMON) {
        /* debugger daemons */
        active_job = jdata->jobid;
        goto launch_apps;
    }

    if (jdata->controls & ORTE_JOB_CONTROL_LOCAL_SLAVE) {
        /* if this is a request to launch a local slave,
         * then we will not be launching an orted - we will
         * directly ssh the slave process itself. No mapping
         * is performed to support this - the caller must
         * provide all the info required to launch the job,
         * including the target hosts
         */
        if (!local_launch_available) {
            /* if we can't support this, then abort */
            orte_show_help("help-plm-slurm.txt", "no-local-slave-support", true);
            return ORTE_ERR_FAILED_TO_START;
        }
        return orte_plm_base_local_slave_launch(jdata);
    }
    
    /* if we are timing, record the start time */
    if (orte_timing) {
        gettimeofday(&orte_plm_globals.daemonlaunchstart, NULL);
    }
    
    /* flag the daemons as failing by default */
    failed_job = ORTE_PROC_MY_NAME->jobid;
    
    if (orte_timing) {
        if (0 != gettimeofday(&launchstart, NULL)) {
            opal_output(0, "plm_slurm: could not obtain job start time");
            launchstart.tv_sec = 0;
            launchstart.tv_usec = 0;
        }        
    }
    
    /* indicate the state of the launch */
    launching_daemons = true;
    
    /* setup the job */
    if (ORTE_SUCCESS != (rc = orte_plm_base_setup_job(jdata))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }

    OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                         "%s plm:slurm: launching job %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         ORTE_JOBID_PRINT(jdata->jobid)));
    
    /* set the active jobid */
    active_job = jdata->jobid;
    
    /* Get the map for this job */
    if (NULL == (map = orte_rmaps.get_job_map(active_job))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        rc = ORTE_ERR_NOT_FOUND;
        goto cleanup;
    }
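    /* convenience pointers into the job's app_context array and the
     * map's node array */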
    apps = (orte_app_context_t**)jdata->apps->addr;
    nodes = (orte_node_t**)map->nodes->addr;
        
    if (0 == map->num_new_daemons) {
        /* no new daemons required - just launch apps */
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: no new daemons to launch",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        goto launch_apps;
    }

    /* need integer value for command line parameter */
    asprintf(&jobid_string, "%lu", (unsigned long) jdata->jobid);

    /*
     * start building argv array
     */
    argv = NULL;
    argc = 0;

    /*
     * SLURM srun OPTIONS
     */

    /* add the srun command */
    opal_argv_append(&argc, &argv, "srun");

    /* Append user defined arguments to srun */
    if ( NULL != mca_plm_slurm_component.custom_args ) {
        custom_strings = opal_argv_split(mca_plm_slurm_component.custom_args, ' ');
        num_args       = opal_argv_count(custom_strings);
        for (i = 0; i < num_args; ++i) {
            opal_argv_append(&argc, &argv, custom_strings[i]);
        }
        opal_argv_free(custom_strings);
    }

    asprintf(&tmp, "--nodes=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    asprintf(&tmp, "--ntasks=%lu", (unsigned long) map->num_new_daemons);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    /* alert us if any orteds die during startup */
    opal_argv_append(&argc, &argv, "--kill-on-bad-exit");

    /* create nodelist */
    nodelist_argv = NULL;

    for (n=0; n < map->num_nodes; n++ ) {
        /* if the daemon already exists on this node, then
         * don't include it
         */
        if (nodes[n]->daemon_launched) {
            continue;
        }
        
        /* otherwise, add it to the list of nodes upon which
         * we need to launch a daemon
         */
        opal_argv_append_nosize(&nodelist_argv, nodes[n]->name);
    }
    if (0 == opal_argv_count(nodelist_argv)) {
        orte_show_help("help-plm-slurm.txt", "no-hosts-in-list", true);
        rc = ORTE_ERR_FAILED_TO_START;
        goto cleanup;
    }
    nodelist_flat = opal_argv_join(nodelist_argv, ',');
    opal_argv_free(nodelist_argv);
    asprintf(&tmp, "--nodelist=%s", nodelist_flat);
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    OPAL_OUTPUT_VERBOSE((2, orte_plm_globals.output,
                         "%s plm:slurm: launching on nodes %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), nodelist_flat));
    
    /*
     * ORTED OPTIONS
     */

    /* add the daemon command (as specified by user) */
    orte_plm_base_setup_orted_cmd(&argc, &argv);
    
    /* Add basic orted command line options, including debug flags */
    orte_plm_base_orted_append_basic_args(&argc, &argv,
                                          "slurm", &proc_vpid_index,
                                          false, nodelist_flat);
    free(nodelist_flat);

    /* tell the new daemons the base of the name list so they can compute
     * their own name on the other end
     */
    rc = orte_util_convert_vpid_to_string(&name_string, map->daemon_vpid_start);
    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm_slurm: unable to get daemon vpid as string");
        goto cleanup;
    }

    free(argv[proc_vpid_index]);
    argv[proc_vpid_index] = strdup(name_string);
    free(name_string);

    /* Copy the prefix-directory specified in the
       corresponding app_context.  If there are multiple,
       different prefixes in the app contexts, complain (i.e., only
       allow one --prefix option for the entire slurm run -- we
       don't support different --prefixes for different nodes in
       the SLURM plm) */
    cur_prefix = NULL;
    for (n=0; n < jdata->num_apps; n++) {
        char * app_prefix_dir = apps[n]->prefix_dir;
        /* Check for an already-set cur_prefix -- if different,
           complain */
        if (NULL != app_prefix_dir) {
            if (NULL != cur_prefix &&
                0 != strcmp (cur_prefix, app_prefix_dir)) {
                orte_show_help("help-plm-slurm.txt", "multiple-prefixes",
                               true, cur_prefix, app_prefix_dir);
                return ORTE_ERR_FATAL;
            }

            /* If not yet set, copy it; if already set, it must be
             * the same anyway
             */
            if (NULL == cur_prefix) {
                cur_prefix = strdup(app_prefix_dir);
                OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                     "%s plm:slurm: Set prefix:%s",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     cur_prefix));
            }
        }
    }

    /* setup environment */
    env = opal_argv_copy(orte_launch_environ);

    /* enable local launch by the orteds */
    var = mca_base_param_environ_variable("plm", NULL, NULL);
    opal_setenv(var, "rsh", true, &env);
    free(var);
    
    /* if we can do it, use the regexp to launch the apps - this
     * requires that the user requested this mode, that we were
     * provided with static ports, and that we only have one
     * app_context
     */
    if (orte_use_regexp && orte_static_ports && jdata->num_apps < 2) {
        char *regexp;
        regexp = orte_regex_encode_maps(jdata);
        opal_argv_append(&argc, &argv, "--launch");
        opal_argv_append(&argc, &argv, regexp);
        free(regexp);
        using_regexp = true;
    }
    
    if (0 < opal_output_get_verbosity(orte_plm_globals.output)) {
        param = opal_argv_join(argv, ' ');
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: final top-level argv:\n\t%s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             (NULL == param) ? "NULL" : param));
        if (NULL != param) free(param);
    }
    
    /* exec the daemon(s) */
    if (ORTE_SUCCESS != (rc = plm_slurm_start_proc(argc, argv, env, cur_prefix))) {
        ORTE_ERROR_LOG(rc);
        goto cleanup;
    }
    
    /* do NOT wait for srun to complete. Srun only completes when the processes
     * it starts - in this case, the orteds - complete. Instead, we'll catch
     * any srun failures and deal with them elsewhere
     */
    
    /* wait for daemons to callback */
    if (ORTE_SUCCESS != (rc = orte_plm_base_daemon_callback(map->num_new_daemons))) {
        OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                             "%s plm:slurm: daemon launch failed for job %s on error %s",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
        goto cleanup;
    }
    
launch_apps:
    /* get here if daemons launch okay - any failures now by apps */
    launching_daemons = false;
    failed_job = active_job;
    if (using_regexp) {
        /* daemons already have launch cmd - just wait for them to
         * report back
         */
        opal_buffer_t launch;
        int8_t flag;
        orte_daemon_cmd_flag_t command = ORTE_DAEMON_ADD_LOCAL_PROCS;
        OBJ_CONSTRUCT(&launch, opal_buffer_t);
        opal_dss.pack(&launch, &command, 1, ORTE_DAEMON_CMD);
        flag = 1;
        opal_dss.pack(&launch, &flag, 1, OPAL_INT8);
        opal_dss.pack(&launch, &orted_launch_cmd, 1, OPAL_STRING);
        ORTE_MESSAGE_EVENT(ORTE_PROC_MY_NAME, &launch, ORTE_RML_TAG_DAEMON, orte_daemon_cmd_processor);
        OBJ_DESTRUCT(&launch);

        if (ORTE_SUCCESS != (rc = orte_plm_base_report_launched(jdata->jobid))) {
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:slurm:launch failed for job %s on error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(jdata->jobid), ORTE_ERROR_NAME(rc)));
            goto cleanup;
        }
    } else {
        if (ORTE_SUCCESS != (rc = orte_plm_base_launch_apps(active_job))) {
            OPAL_OUTPUT_VERBOSE((1, orte_plm_globals.output,
                                 "%s plm:slurm: launch of apps failed for job %s on error %s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_JOBID_PRINT(active_job), ORTE_ERROR_NAME(rc)));
            goto cleanup;
        }
    }

    /* declare the launch a success */
    failed_launch = false;
    
    if (orte_timing) {
        if (0 != gettimeofday(&launchstop, NULL)) {
             opal_output(0, "plm_slurm: could not obtain stop time");
         } else {
             opal_output(0, "plm_slurm: total job launch time is %ld usec",
                         (launchstop.tv_sec - launchstart.tv_sec)*1000000 + 
                         (launchstop.tv_usec - launchstart.tv_usec));
         }
    }

    if (ORTE_SUCCESS != rc) {
        opal_output(0, "plm:slurm: start_procs returned error %d", rc);
        goto cleanup;
    }

cleanup:
    if (NULL != argv) {
        opal_argv_free(argv);
    }
    if (NULL != env) {
        opal_argv_free(env);
    }
    
    if (NULL != jobid_string) {
        free(jobid_string);
    }
    
    /* check for failed launch - if so, force terminate */
    if (failed_launch) {
        orte_plm_base_launch_failed(failed_job, -1, ORTE_ERROR_DEFAULT_EXIT_CODE, ORTE_JOB_STATE_FAILED_TO_START);
    }
    
    return rc;
}
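
The srun command line above is assembled incrementally with the opal_argv utilities. A condensed sketch of that idiom, using the same opal_argv_append/opal_argv_join calls as the listing (the node count of 4 is a made-up value):

#include <stdio.h>
#include <stdlib.h>
#include "opal/util/argv.h"

int build_cmd_example(void)
{
    char **argv = NULL;
    int argc = 0;
    char *tmp, *flat;

    opal_argv_append(&argc, &argv, "srun");   /* base command */
    asprintf(&tmp, "--nodes=%d", 4);          /* formatted option */
    opal_argv_append(&argc, &argv, tmp);
    free(tmp);

    flat = opal_argv_join(argv, ' ');         /* "srun --nodes=4" */
    printf("%s\n", flat);

    free(flat);
    opal_argv_free(argv);
    return 0;
}
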
Code example #13
File: pml_yalla_datatype.c Project: Cai900205/test
static void mca_pml_yalla_convertor_construct(mca_pml_yalla_convertor_t *convertor)
{
    OBJ_CONSTRUCT(&convertor->convertor, opal_convertor_t);
}
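
Constructors like mca_pml_yalla_convertor_construct are attached to their class with OBJ_CLASS_INSTANCE and run automatically when an object is created. A hedged sketch of the two usual lifecycles (my_item_t and its functions are invented for illustration):

#include "opal/class/opal_object.h"

typedef struct {
    opal_object_t super;    /* OPAL objects embed their parent first */
    int payload;
} my_item_t;

static void my_item_construct(my_item_t *item) { item->payload = 0; }
static void my_item_destruct(my_item_t *item)  { /* release resources here */ }

OBJ_CLASS_INSTANCE(my_item_t, opal_object_t,
                   my_item_construct, my_item_destruct);

void lifecycle_example(void)
{
    /* heap: reference-counted, destructed when the count reaches zero */
    my_item_t *heap = OBJ_NEW(my_item_t);
    OBJ_RELEASE(heap);

    /* stack/embedded: explicit construct and destruct, as in the listing */
    my_item_t stack;
    OBJ_CONSTRUCT(&stack, my_item_t);
    OBJ_DESTRUCT(&stack);
}
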
Code example #14
File: pmix_native.c Project: htquach/gemeter
static bool native_get_attr(const char *attr, opal_value_t **kv)
{
    opal_buffer_t *msg, *bptr;
    opal_list_t vals;
    opal_value_t *kp, *lclpeers=NULL, kvn;
    pmix_cmd_t cmd = PMIX_GETATTR_CMD;
    char **ranks;
    int rc, ret;
    int32_t cnt;
    bool found=false;
    opal_hwloc_locality_t locality;
    pmix_cb_t *cb;
    uint32_t i, myrank;
    opal_process_name_t id;
    char *cpuset;
    opal_buffer_t buf, buf2;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:native get_attr called",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));

    /* try to retrieve the requested value from the dstore */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == opal_dstore.fetch(opal_dstore_internal, &OPAL_PROC_MY_NAME, attr, &vals)) {
        *kv = (opal_value_t*)opal_list_remove_first(&vals);
        OPAL_LIST_DESTRUCT(&vals);
        return true;
    }

    if (NULL == mca_pmix_native_component.uri) {
        /* no server available, so just return */
        return false;
    }

    /* if the value isn't yet available, then we should try to retrieve
     * all the available attributes and store them for future use */
    msg = OBJ_NEW(opal_buffer_t);
    /* pack the cmd */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &cmd, 1, PMIX_CMD_T))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return false;
    }

    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = OBJ_NEW(pmix_cb_t);
    cb->active = true;

    /* push the message into our event base to send to the server */
    PMIX_ACTIVATE_SEND_RECV(msg, wait_cbfunc, cb);

    /* wait for the data to return */
    PMIX_WAIT_FOR_COMPLETION(cb->active);

    /* we have received the entire data blob for this process - unpack
     * and cache all values, keeping the one we requested to return
     * to the caller */
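    /* note: opal_dss.unpack overwrites cnt with the number of items it
     * actually unpacked, so it must be reset to 1 before every call below */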
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &ret, &cnt, OPAL_INT))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(cb);
        return false;
    }
    if (OPAL_SUCCESS == ret) {
        /* unpack the buffer containing the values */
        cnt = 1;
        if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
            OPAL_ERROR_LOG(rc);
            OBJ_RELEASE(cb);
            return false;
        }
        cnt = 1;
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s unpacked attr %s",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key);
            /* if this is the local topology, we need to save it in a special way */
#if OPAL_HAVE_HWLOC
            {
                hwloc_topology_t topo;
                if (0 == strcmp(PMIX_LOCAL_TOPO, kp->key)) {
                    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                        "%s saving topology",
                                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                    /* transfer the byte object for unpacking */
                    OBJ_CONSTRUCT(&buf, opal_buffer_t);
                    opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                    kp->data.bo.bytes = NULL;  // protect the data region
                    kp->data.bo.size = 0;
                    OBJ_RELEASE(kp);
                    /* extract the topology */
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &topo, &cnt, OPAL_HWLOC_TOPO))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        continue;
                    }
                    OBJ_DESTRUCT(&buf);
                    if (NULL == opal_hwloc_topology) {
                        opal_hwloc_topology = topo;
                    } else {
                        hwloc_topology_destroy(topo);
                    }
                    cnt = 1;
                    continue;
                }
            }
#endif
            /* if this is the local cpuset blob, then unpack and store its contents */
            if (0 == strcmp(PMIX_LOCAL_CPUSETS, kp->key)) {
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s received local cpusets",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                /* transfer the byte object for unpacking */
                OBJ_CONSTRUCT(&buf, opal_buffer_t);
                opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                kp->data.bo.bytes = NULL;  // protect the data region
                kp->data.bo.size = 0;
                OBJ_RELEASE(kp);
                cnt=1;
                while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &id, &cnt, OPAL_NAME))) {
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &cpuset, &cnt, OPAL_STRING))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        cnt = 1;
                        continue;
                    }
                    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                        "%s saving cpuset %s for local peer %s",
                                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                        (NULL == cpuset) ? "NULL" : cpuset,
                                        OPAL_NAME_PRINT(id));
                    OBJ_CONSTRUCT(&kvn, opal_value_t);
                    kvn.key = strdup(OPAL_DSTORE_CPUSET);
                    kvn.type = OPAL_STRING;
                    kvn.data.string = cpuset;
                    if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, &kvn))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&kvn);
                        cnt = 1;
                        continue;
                    }
                    OBJ_DESTRUCT(&kvn);
                }
                OBJ_DESTRUCT(&buf);
                if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                    OPAL_ERROR_LOG(rc);
                    return false;
                }
                cnt=1;
                continue;
            } else if (0 == strcmp(PMIX_PROC_MAP, kp->key)) {
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s received proc map",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
                /* transfer the byte object for unpacking */
                OBJ_CONSTRUCT(&buf, opal_buffer_t);
                opal_dss.load(&buf, kp->data.bo.bytes, kp->data.bo.size);
                kp->data.bo.bytes = NULL;  // protect the data region
                kp->data.bo.size = 0;
                OBJ_RELEASE(kp);
                /* get the jobid */
                cnt=1;
                if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                    OPAL_ERROR_LOG(rc);
                    OBJ_DESTRUCT(&buf);
                    cnt = 1;
                    return false;
                }
                if (0 != strcmp(PMIX_JOBID, kp->key)) {
                    OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                    OBJ_DESTRUCT(&buf);
                    OBJ_RELEASE(kp);
                    cnt = 1;
                    return false;
                }
                id.jobid = kp->data.uint32;
                OBJ_RELEASE(kp);
                /* unpack the data for each rank */
                cnt=1;
                while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                    if (0 != strcmp(PMIX_RANK, kp->key)) {
                        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                        OBJ_DESTRUCT(&buf);
                        OBJ_RELEASE(kp);
                        cnt = 1;
                        return false;
                    }
                    id.vpid = kp->data.uint32;
                    /* unpack the blob for this rank */
                    cnt=1;
                    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&buf, &kp, &cnt, OPAL_VALUE))) {
                        OPAL_ERROR_LOG(rc);
                        OBJ_DESTRUCT(&buf);
                        cnt = 1;
                        return false;
                    }
                    if (0 != strcmp(PMIX_PROC_MAP, kp->key)) {
                        OPAL_ERROR_LOG(OPAL_ERR_BAD_PARAM);
                        OBJ_DESTRUCT(&buf);
                        OBJ_RELEASE(kp);
                        cnt = 1;
                        return false;
                    }
                    /* transfer the byte object for unpacking */
                    OBJ_CONSTRUCT(&buf2, opal_buffer_t);
                    opal_dss.load(&buf2, kp->data.bo.bytes, kp->data.bo.size);
                    kp->data.bo.bytes = NULL;  // protect the data region
                    kp->data.bo.size = 0;
                    OBJ_RELEASE(kp);
                    /* unpack and store the map */
                    cnt=1;
                    while (OPAL_SUCCESS == (rc = opal_dss.unpack(&buf2, &kp, &cnt, OPAL_VALUE))) {
                        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                            "%s storing key %s for peer %s",
                                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                            kp->key, OPAL_NAME_PRINT(id));
                        if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &id, kp))) {
                            OPAL_ERROR_LOG(rc);
                            OBJ_RELEASE(kp);
                            OBJ_DESTRUCT(&buf2);
                            return false;
                        }
                    }
                    OBJ_DESTRUCT(&buf2);
                    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                        OPAL_ERROR_LOG(rc);
                        return false;
                    }
                    cnt=1;
                }
                OBJ_DESTRUCT(&buf);
                if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
                    OPAL_ERROR_LOG(rc);
                    return false;
                }
                cnt=1;
                continue;
            }
            /* otherwise, it is a single piece of info, so store it */
            if (OPAL_SUCCESS != (rc = opal_dstore.store(opal_dstore_internal, &OPAL_PROC_MY_NAME, kp))) {
                OPAL_ERROR_LOG(rc);
                OBJ_RELEASE(kp);
                cnt = 1;
                continue;
            }
            /* save the list of local peers */
            if (0 == strcmp(PMIX_LOCAL_PEERS, kp->key)) {
                OBJ_RETAIN(kp);
                lclpeers = kp;
                opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                    "%s saving local peers %s",
                                    OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), lclpeers->data.string);
            } else if (0 == strcmp(PMIX_JOBID, kp->key)) {
                native_pname.jobid = kp->data.uint32;
            } else if (0 == strcmp(PMIX_RANK, kp->key)) {
                native_pname.vpid = kp->data.uint32;
            }
            if (0 == strcmp(attr, kp->key)) {
                OBJ_RETAIN(kp);
                *kv = kp;
                found = true;
            }
            OBJ_RELEASE(kp);
            cnt = 1;
        }
        OBJ_RELEASE(bptr);
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            OPAL_ERROR_LOG(rc);
            return false;
        }
    } else {
        OPAL_ERROR_LOG(ret);
        OBJ_RELEASE(cb);
        return false;
    }
    OBJ_RELEASE(cb);
    opal_proc_set_name(&native_pname);

    /* if the list of local peers wasn't included, then we are done */
    if (NULL == lclpeers) {
        opal_output_verbose(0, opal_pmix_base_framework.framework_output,
                            "%s no local peers reported",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
        return found;
    }

    /* baseline all the procs as nonlocal */
    myrank = native_pname.vpid;
    id.jobid = native_pname.jobid;

#if OPAL_HAVE_HWLOC
    /* fetch my cpuset */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == (rc = opal_dstore.fetch(opal_dstore_internal, &native_pname,
                                                OPAL_DSTORE_CPUSET, &vals))) {
        kp = (opal_value_t*)opal_list_get_first(&vals);
        cpuset = strdup(kp->data.string);
    } else {
        cpuset = NULL;
    }
    OPAL_LIST_DESTRUCT(&vals);
#endif

    /* we only need to set locality for each local rank as "not found"
     * equates to "non local" */
    ranks = opal_argv_split(lclpeers->data.string, ',');
    for (i=0; NULL != ranks[i]; i++) {
        uint32_t vid = strtoul(ranks[i], NULL, 10);
        if (myrank == vid) {
            continue;
        }
        id.vpid = vid;
#if OPAL_HAVE_HWLOC
        OBJ_CONSTRUCT(&vals, opal_list_t);
        if (OPAL_SUCCESS != (rc = opal_dstore.fetch(opal_dstore_internal, &id,
                                                    OPAL_DSTORE_CPUSET, &vals))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s cpuset for local proc %s not found",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                                OPAL_NAME_PRINT(id));
            OPAL_LIST_DESTRUCT(&vals);
            /* even though the cpuset wasn't found, we at least know it is
             * on the same node with us */
            locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
        } else {
            kp = (opal_value_t*)opal_list_get_first(&vals);
            if (NULL == kp->data.string) {
                /* if we share a node, but we don't know anything more, then
                 * mark us as on the node as this is all we know
                 */
                locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
            } else {
                /* determine relative location on our node */
                locality = opal_hwloc_base_get_relative_locality(opal_hwloc_topology,
                                                                 cpuset,
                                                                 kp->data.string);
            }
            OPAL_LIST_DESTRUCT(&vals);
        }
#else
        /* all we know is we share a node */
        locality = OPAL_PROC_ON_CLUSTER | OPAL_PROC_ON_CU | OPAL_PROC_ON_NODE;
#endif
        OPAL_OUTPUT_VERBOSE((1, opal_pmix_base_framework.framework_output,
                             "%s pmix:native proc %s locality %s",
                             OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                             OPAL_NAME_PRINT(id),
                             opal_hwloc_base_print_locality(locality)));

        OBJ_CONSTRUCT(&kvn, opal_value_t);
        kvn.key = strdup(OPAL_DSTORE_LOCALITY);
        kvn.type = OPAL_UINT16;
        kvn.data.uint16 = locality;
        (void)opal_dstore.store(opal_dstore_internal, &id, &kvn);
        OBJ_DESTRUCT(&kvn);
    }
#if OPAL_HAVE_HWLOC
    if (NULL != cpuset) {
        free(cpuset);
    }
#endif
    opal_argv_free(ranks);

    return found;
}
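
native_get_attr above repeatedly drains buffers with opal_dss.unpack until the end-of-buffer code comes back, which is the expected terminator rather than an error. A minimal sketch of that drain loop, assuming a buffer packed with OPAL_INT values:

#include "opal/dss/dss.h"

int drain_ints_example(opal_buffer_t *buf)
{
    int32_t cnt = 1;     /* request one element per call */
    int value, rc;

    while (OPAL_SUCCESS == (rc = opal_dss.unpack(buf, &value, &cnt, OPAL_INT))) {
        /* ... use value ... */
        cnt = 1;         /* unpack overwrites cnt; reset before each call */
    }
    /* running off the end terminates the loop normally */
    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        return rc;       /* any other code is a real unpack failure */
    }
    return OPAL_SUCCESS;
}
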
Code example #15
File: rmaps_lb.c Project: 315234/OpenFOAM-2.2.x-OSX
/* place specified #procs on each node, up to the specified total
 * number of procs (if one was given).
 */
static int npernode(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j, rc=ORTE_SUCCESS;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_node_t *node;
    int np, nprocs;
    int num_nodes;
    
    /* setup the node list */
    OBJ_CONSTRUCT(&node_list, opal_list_t);
   
    /* loop through the app_contexts */
    for (i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* use the number of procs if one was given */
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            np = INT_MAX;
        }
        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* loop through the list of nodes */
        num_nodes = opal_list_get_size(&node_list);
        nprocs = 0;
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < jdata->map->npernode && nprocs < np; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /* if the code is ORTE_ERR_NODE_FULLY_USED and we still have
                     * more procs to place on this node, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < jdata->map->npernode-1) {
                        ORTE_ERROR_LOG(rc);
                        OBJ_RELEASE(node);
                        goto error;
                    }
                }
                nprocs++;
            }
            OBJ_RELEASE(node);
        }
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;
        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem
         */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                           app->app, app->num_procs,
                           "number of nodes", num_nodes,
                           "npernode", jdata->map->npernode);
            return ORTE_ERR_SILENT;
        }
        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }

error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
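
Both the mapping loop and the error path above empty an opal_list_t the same way: pop items off the front until the list is exhausted. A minimal sketch of that idiom:

#include "opal/class/opal_list.h"

void drain_list_example(void)
{
    opal_list_t list;
    opal_list_item_t *item;

    OBJ_CONSTRUCT(&list, opal_list_t);
    /* ... items would be appended here with opal_list_append() ... */

    while (NULL != (item = opal_list_remove_first(&list))) {
        OBJ_RELEASE(item);   /* the list no longer owns the item */
    }
    OBJ_DESTRUCT(&list);     /* safe once the list is empty */
}
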
Code example #16
static int rte_init(char flags)
{
    int ret;
    char *error = NULL;
    orte_jmap_t *jmap;

    /* run the prolog */
    if (ORTE_SUCCESS != (ret = orte_ess_base_std_prolog())) {
        error = "orte_ess_base_std_prolog";
        goto error;
    }
    
    /* Start by getting a unique name */
    slurm_set_name();
    
    /* if I am a daemon, complete my setup using the
     * default procedure
     */
    if (orte_process_info.daemon) {
        if (ORTE_SUCCESS != (ret = orte_ess_base_orted_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_orted_setup";
            goto error;
        }
    } else if (orte_process_info.tool) {
        /* otherwise, if I am a tool proc, use that procedure */
        if (ORTE_SUCCESS != (ret = orte_ess_base_tool_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_tool_setup";
            goto error;
        }
    } else {
        /* otherwise, I must be an application process - use
         * the default procedure to finish my setup
         */
        if (ORTE_SUCCESS != (ret = orte_ess_base_app_setup())) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_app_setup";
            goto error;
        }
        
        /* setup the nidmap arrays */
        OBJ_CONSTRUCT(&nidmap, opal_pointer_array_t);
        opal_pointer_array_init(&nidmap, 8, INT32_MAX, 8);
        
        /* setup array of jmaps */
        OBJ_CONSTRUCT(&jobmap, opal_pointer_array_t);
        opal_pointer_array_init(&jobmap, 1, INT32_MAX, 1);
        jmap = OBJ_NEW(orte_jmap_t);
        jmap->job = ORTE_PROC_MY_NAME->jobid;
        opal_pointer_array_add(&jobmap, jmap);
        
        /* if one was provided, build my nidmap */
        if (ORTE_SUCCESS != (ret = orte_ess_base_build_nidmap(orte_process_info.sync_buf,
                                                              &nidmap, &jmap->pmap, &nprocs))) {
            ORTE_ERROR_LOG(ret);
            error = "orte_ess_base_build_nidmap";
            goto error;
        }
    }
    
    return ORTE_SUCCESS;
    
error:
    orte_show_help("help-orte-runtime.txt",
                   "orte_init:startup:internal-failure",
                   true, error, ORTE_ERROR_NAME(ret), ret);
    
    return ret;
}
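
The nidmap/jobmap setup above relies on opal_pointer_array_t; a short sketch of that container's basic lifecycle (the sizes here are arbitrary):

#include <stdint.h>
#include "opal/class/opal_pointer_array.h"

void pointer_array_example(void)
{
    opal_pointer_array_t arr;
    static int datum = 42;   /* something to store */
    int idx;
    void *back;

    OBJ_CONSTRUCT(&arr, opal_pointer_array_t);
    /* args: initial allocation, maximum size, growth increment */
    opal_pointer_array_init(&arr, 8, INT32_MAX, 8);

    idx = opal_pointer_array_add(&arr, &datum);      /* returns the slot */
    back = opal_pointer_array_get_item(&arr, idx);   /* &datum again */
    (void)back;

    OBJ_DESTRUCT(&arr);
}
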
Code example #17
static void proc_data_construct(opal_dstore_proc_data_t *ptr)
{
    ptr->loaded = false;
    OBJ_CONSTRUCT(&ptr->data, opal_list_t);
}
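
A constructor like this is normally paired with a destructor that empties the embedded list before the object is destroyed. A hedged sketch of what that counterpart plausibly looks like (the real destructor lives elsewhere in the dstore source):

static void proc_data_destruct(opal_dstore_proc_data_t *ptr)
{
    opal_list_item_t *item;

    /* release any values still stored for this proc */
    while (NULL != (item = opal_list_remove_first(&ptr->data))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&ptr->data);
}
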
Code example #18
File: oob_stress.c Project: bringhurst/ompi
int
main(int argc, char *argv[]){
    int count;
    int msgsize;
    uint8_t *msg;
    int i, j, rc;
    orte_process_name_t peer;
    double maxpower;
    
    /*
     * Init
     */
    orte_init(&argc, &argv, ORTE_PROC_NON_MPI);

    if (argc > 1) {
        count = atoi(argv[1]);
        if (count < 0) {
            count = INT_MAX-1;
        }
    } else {
        count = MAX_COUNT;
    }
    
    peer.jobid = ORTE_PROC_MY_NAME->jobid;
    
    for (j=1; j < count+1; j++) {
        peer.vpid = (ORTE_PROC_MY_NAME->vpid + j) % orte_process_info.num_procs;
        ORTE_EPOCH_SET(peer.epoch,orte_ess.proc_get_epoch(&peer));
        
        /* rank0 starts ring */
        if (ORTE_PROC_MY_NAME->vpid == 0) {
            /* setup the initiating buffer - put random sized message in it */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            
            maxpower = (double)(j%7);
            msgsize = (int)pow(10.0, maxpower);
            opal_output(0, "Ring %d message size %d bytes", j, msgsize);
            msg = (uint8_t*)malloc(msgsize);
            opal_dss.pack(&buf, msg, msgsize, OPAL_BYTE);
            
            if (0 > (rc = orte_rml.send_buffer(&peer,&buf, MY_TAG, 0))) {
                opal_output(0, "error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
            /* wait for it to come around */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            opal_output(0, "%s Ring %d completed", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), j);
        } else {
            /* wait for msg */
            OBJ_CONSTRUCT(&buf, opal_buffer_t);
            msg_recvd = false;
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, MY_TAG,
                                    ORTE_RML_NON_PERSISTENT, recv_ack, NULL);
            
            ORTE_PROGRESSED_WAIT(msg_recvd, 0, 1);
            /* send it along */
            if (0 > (rc = orte_rml.send_buffer(&peer, &buf, MY_TAG, 0))) {
                opal_output(0, "%s error sending to %s %s\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            ORTE_NAME_PRINT(&peer), ORTE_ERROR_NAME(rc));
                exit(1);
            }
            OBJ_DESTRUCT(&buf);
        }
    }

    orte_finalize();

    return 0;
}
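
The ring test above references a shared buffer, a msg_recvd flag, and a recv_ack callback that are defined elsewhere in oob_stress.c. A hedged reconstruction of what those plausibly look like, based on the non-persistent RML buffer-callback signature of this ORTE generation:

static opal_buffer_t buf;     /* ring payload shared with main() */
static bool msg_recvd;        /* flag polled by ORTE_PROGRESSED_WAIT */

static void recv_ack(int status, orte_process_name_t *sender,
                     opal_buffer_t *buffer, orte_rml_tag_t tag,
                     void *cbdata)
{
    /* keep a copy of the payload so non-root ranks can forward it */
    opal_dss.copy_payload(&buf, buffer);
    msg_recvd = true;         /* release the progressed wait */
}
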
Code example #19
File: comm_dyn.c Project: meghnave/SpherePacking
int
ompi_comm_start_processes(int count, char **array_of_commands,
                          char ***array_of_argv,
                          int *array_of_maxprocs,
                          MPI_Info *array_of_info,
                          char *port_name)
{
    int rc, i, j, counter;
    int have_wdir=0;
    bool have_prefix;
    int valuelen=OMPI_PATH_MAX, flag=0;
    char cwd[OMPI_PATH_MAX];
    char host[OMPI_PATH_MAX];  /*** should define OMPI_HOST_MAX ***/
    char prefix[OMPI_PATH_MAX];
    char *base_prefix;

    orte_std_cntr_t num_apps, ai;
    orte_jobid_t new_jobid=ORTE_JOBID_INVALID;
    orte_app_context_t **apps=NULL;
    
    opal_list_t attributes;
    opal_list_item_t *item;

    bool timing = false;
    struct timeval ompistart, ompistop;
    int param, value;
    
    /* parse the info object */
    /* check potentially for:
       - "host": desired host where to spawn the processes
       - "prefix": the path to the root of the directory tree where ompi
                   executables and libraries can be found
       - "arch": desired architecture
       - "wdir": directory, where executable can be found
       - "path": list of directories where to look for the executable
       - "file": filename, where additional information is provided.
       - "soft": see page 92 of MPI-2.
    */

    /* make sure the progress engine properly trips the event library */
    opal_progress_event_increment();
    
    /* check to see if we want timing information */
    param = mca_base_param_reg_int_name("ompi", "timing",
                                        "Request that critical timing loops be measured",
                                        false, false, 0, &value);
    if (value != 0) {
        timing = true;
        if (0 != gettimeofday(&ompistart, NULL)) {
            opal_output(0, "ompi_comm_start_procs: could not obtain start time");
            ompistart.tv_sec = 0;
            ompistart.tv_usec = 0;
        }
    }
    
    /* setup to record the attributes */
    OBJ_CONSTRUCT(&attributes, opal_list_t);
    
    /* we want to be able to default the prefix to the one used for this job
     * so that the ompi executables and libraries can be found. the user can
     * later override this value by providing an MPI_Info value. for now, though,
     * let's get the default value off the registry
     */
    if (ORTE_SUCCESS != (rc = orte_rmgr.get_app_context(orte_process_info.my_name->jobid, &apps, &num_apps))) {
        ORTE_ERROR_LOG(rc);
        return rc;
    }
    /* we'll just use the prefix from the first member of the app_context array.
     * this shouldn't matter as they all should be the same. it could be NULL, of
     * course (user might not have specified it), so we need to protect against that.
     *
     * It's possible that no app_contexts are returned (e.g., during a comm_spawn
     * from a singleton), so check first
     */
    if (NULL != apps && NULL != apps[0]->prefix_dir) {
        base_prefix = strdup(apps[0]->prefix_dir);
    } else {
        base_prefix = NULL;
    }
    /* cleanup the memory we used */
    for (ai = 0; ai < num_apps; ai++) {
        OBJ_RELEASE(apps[ai]);
    }
    if (NULL != apps) free(apps);

    /* Convert the list of commands to an array of orte_app_context_t
       pointers */
    apps = (orte_app_context_t**)malloc(count * sizeof(orte_app_context_t *));
    if (NULL == apps) {
        ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
        return ORTE_ERR_OUT_OF_RESOURCE;
    }
    for (i = 0; i < count; ++i) {
        apps[i] = OBJ_NEW(orte_app_context_t);
        if (NULL == apps[i]) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            /* rollback what was already done */
            for (j=0; j < i; j++) OBJ_RELEASE(apps[j]);
            opal_progress_event_decrement();
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* copy over the name of the executable */
        apps[i]->app = strdup(array_of_commands[i]);
        if (NULL == apps[i]->app) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            /* rollback what was already done */
            for (j=0; j < i; j++) OBJ_RELEASE(apps[j]);
            opal_progress_event_decrement();
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        /* record the number of procs to be generated */
        apps[i]->num_procs = array_of_maxprocs[i];

        /* copy over the argv array */
        counter = 1;

        if (MPI_ARGVS_NULL != array_of_argv &&
            MPI_ARGV_NULL != array_of_argv[i]) {
            /* first need to find out how many entries there are */
            j=0;
            while (NULL != array_of_argv[i][j]) {
                j++;
            }
            counter += j;
        }

        /* now copy them over, ensuring to NULL terminate the array */
        apps[i]->argv = (char**)malloc((1 + counter) * sizeof(char*));
        if (NULL == apps[i]->argv) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            /* rollback what was already done */
            for (j=0; j < i; j++) {
                OBJ_RELEASE(apps[j]);
            }
            opal_progress_event_decrement();
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        apps[i]->argv[0] = strdup(array_of_commands[i]);
        for (j=1; j < counter; j++) {
            apps[i]->argv[j] = strdup(array_of_argv[i][j-1]);
        }
        apps[i]->argv[counter] = NULL;


        /* the environment gets set by the launcher
         * all we need to do is add the specific values
         * needed for comm_spawn
         */
        /* Add environment variable with the contact information for the
           child processes.
        */
        counter = 1;
        apps[i]->env = (char**)malloc((1+counter) * sizeof(char*));
        if (NULL == apps[i]->env) {
            ORTE_ERROR_LOG(ORTE_ERR_OUT_OF_RESOURCE);
            /* rollback what was already done */
            for (j=0; j < i; j++) OBJ_RELEASE(apps[j]);
            opal_progress_event_decrement();
            return ORTE_ERR_OUT_OF_RESOURCE;
        }
        asprintf(&(apps[i]->env[0]), "OMPI_PARENT_PORT=%s", port_name);
        apps[i]->env[1] = NULL;
        for (j = 0; NULL != environ[j]; ++j) {
            if (0 == strncmp("OMPI_", environ[j], 5)) {
                opal_argv_append_nosize(&apps[i]->env, environ[j]);
            }
        }

        /* Check for well-known info keys */
        have_wdir = 0;
        have_prefix = false;
        if ( array_of_info != NULL && array_of_info[i] != MPI_INFO_NULL ) {

            /* check for 'wdir' */ 
            ompi_info_get (array_of_info[i], "wdir", valuelen, cwd, &flag);
            if ( flag ) {
                apps[i]->cwd = strdup(cwd);
                have_wdir = 1;
            }

            /* check for 'host' */
            ompi_info_get (array_of_info[i], "host", sizeof(host), host, &flag);
            if ( flag ) {
                apps[i]->num_map = 1;
                apps[i]->map_data = (orte_app_context_map_t **) malloc(sizeof(orte_app_context_map_t *));
                apps[i]->map_data[0] = OBJ_NEW(orte_app_context_map_t);
                apps[i]->map_data[0]->map_type = ORTE_APP_CONTEXT_MAP_HOSTNAME;
                apps[i]->map_data[0]->map_data = strdup(host);
            }
 
            /* 'path', 'arch', 'file', 'soft' -- to be implemented */ 
            
            /* check for 'ompi_prefix' (OMPI-specific -- to effect the same
             * behavior as --prefix option to orterun)
             */
            ompi_info_get (array_of_info[i], "ompi_prefix", sizeof(prefix), prefix, &flag);
            if ( flag ) {
                apps[i]->prefix_dir = strdup(prefix);
                have_prefix = true;
            }
        }

        /* default value: If the user did not tell us where to look for the
           executable, we assume the current working directory */
        if ( !have_wdir ) {
            getcwd(cwd, OMPI_PATH_MAX);
            apps[i]->cwd = strdup(cwd);
        }
        
        /* if the user told us a new prefix, then we leave it alone. otherwise, if
         * a prefix had been provided before, copy that one into the new app_context
         * for use by the spawned children
         */
        if ( !have_prefix && NULL != base_prefix) {
            apps[i]->prefix_dir = strdup(base_prefix);
        }
        
        /* leave the map info alone - the launcher will
         * decide where to put things
         */
    } /* for (i = 0 ; i < count ; ++i) */

    /* cleanup */
    if (NULL != base_prefix) free(base_prefix);

    /* tell the RTE that we want to the children to run inside of our allocation -
     * don't go get one just for them
     */
    if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_RAS_USE_PARENT_ALLOCATION,
                                                      ORTE_JOBID, &(orte_process_info.my_name->jobid),
                                                      ORTE_RMGR_ATTR_OVERRIDE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&attributes);
        opal_progress_event_decrement();
        return MPI_ERR_SPAWN;
    }

    /* tell the RTE that we want the children mapped the same way as their parent */
    if (ORTE_SUCCESS != (rc = orte_rmgr.add_attribute(&attributes, ORTE_RMAPS_USE_PARENT_PLAN,
                                                      ORTE_JOBID, &(orte_process_info.my_name->jobid),
                                                      ORTE_RMGR_ATTR_OVERRIDE))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&attributes);
        opal_progress_event_decrement();
        return MPI_ERR_SPAWN;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing) {
        if (0 != gettimeofday(&ompistop, NULL)) {
            opal_output(0, "ompi_comm_start_procs: could not obtain stop time");
        } else {
            opal_output(0, "ompi_comm_start_procs: time from start to prepare to spawn %ld usec",
                        (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                                   (ompistop.tv_usec - ompistart.tv_usec)));
            if (0 != gettimeofday(&ompistart, NULL)) {
                opal_output(0, "ompi_comm_start_procs: could not obtain new start time");
                ompistart.tv_sec = ompistop.tv_sec;
                ompistart.tv_usec = ompistop.tv_usec;
            }
        }
    }
    
    /* spawn procs */
    if (ORTE_SUCCESS != (rc = orte_rmgr.spawn_job(apps, count, &new_jobid, 0, NULL, NULL, ORTE_PROC_STATE_NONE, &attributes))) {
        ORTE_ERROR_LOG(rc);
        opal_progress_event_decrement();
        return MPI_ERR_SPAWN;
    }

    /* check for timing request - get stop time and report elapsed time if so */
    if (timing) {
        if (0 != gettimeofday(&ompistop, NULL)) {
            opal_output(0, "ompi_comm_start_procs: could not obtain stop time");
        } else {
            opal_output(0, "ompi_comm_start_procs: time to spawn %ld usec",
                        (long int)((ompistop.tv_sec - ompistart.tv_sec)*1000000 +
                                   (ompistop.tv_usec - ompistart.tv_usec)));
         }
    }
    
    /* clean up */
    opal_progress_event_decrement();
    while (NULL != (item = opal_list_remove_first(&attributes))) OBJ_RELEASE(item);
    OBJ_DESTRUCT(&attributes);

    for (i = 0; i < count; i++) {
        OBJ_RELEASE(apps[i]);
    }
    free (apps);

    return OMPI_SUCCESS;
}
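On the application side, the info keys parsed above ("wdir", "host", "ompi_prefix") arrive via a standard MPI_Comm_spawn call. A minimal caller, with placeholder command, host, and path values:

#include <mpi.h>

/* Spawn four copies of a worker, steering placement and working
 * directory through the info keys this function understands. */
int spawn_workers(MPI_Comm *intercomm)
{
    MPI_Info info;
    int errcodes[4];

    MPI_Info_create(&info);
    MPI_Info_set(info, "wdir", "/tmp");    /* placeholder directory */
    MPI_Info_set(info, "host", "node01");  /* placeholder hostname */

    int rc = MPI_Comm_spawn("./worker", MPI_ARGV_NULL, 4, info,
                            0, MPI_COMM_SELF, intercomm, errcodes);
    MPI_Info_free(&info);
    return rc;
}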
Code example #20
/*
 * Traverse the entire list of found components (a list of
 * mca_base_component_t instances).  If the requested_component_names
 * array is empty, or the name of each component in the list of found
 * components is in the requested_components_array, try to open it.
 * If it opens, add it to the components_available list.
 */
static int register_components(const char *project_name, const char *type_name,
                               int output_id, opal_list_t *src, opal_list_t *dest)
{
    int ret;
    opal_list_item_t *item;
    mca_base_component_t *component;
    mca_base_component_list_item_t *cli;
    
    /* Announce */
    opal_output_verbose(10, output_id,
                        "mca: base: components_register: registering %s components",
                        type_name);
    
    /* Traverse the list of found components */
    
    OBJ_CONSTRUCT(dest, opal_list_t);
    while (NULL != (item = opal_list_remove_first (src))) {
        cli = (mca_base_component_list_item_t *) item;
        component = (mca_base_component_t *)cli->cli_component;

        opal_output_verbose(10, output_id, 
                            "mca: base: components_register: found loaded component %s",
                            component->mca_component_name);

        /* Call the component's MCA parameter registration function (or open if register doesn't exist) */
        if (NULL == component->mca_register_component_params) {
            opal_output_verbose(10, output_id, 
                                "mca: base: components_register: "
                                "component %s has no register or open function",
                                component->mca_component_name);
            ret = OPAL_SUCCESS;
        } else {
            ret = component->mca_register_component_params();
        }

        if (OPAL_SUCCESS != ret) {
            if (OPAL_ERR_NOT_AVAILABLE != ret) {
                /* If the component returns OPAL_ERR_NOT_AVAILABLE,
                   it's a cue to "silently ignore me" -- it's not a
                   failure, it's just a way for the component to say
                   "nope!".  
 
                   Otherwise, however, display an error.  We may end
                   up displaying this twice, but it may go to separate
                   streams.  So better to be redundant than to not
                   display the error in the stream where it was
                   expected. */
                
                if (mca_base_component_show_load_errors) {
                    opal_output(0, "mca: base: components_register: "
                                "component %s / %s register function failed",
                                component->mca_type_name,
                                component->mca_component_name);
                }

                opal_output_verbose(10, output_id, 
                                    "mca: base: components_register: "
                                    "component %s register function failed",
                                    component->mca_component_name);
            }

            mca_base_component_unload (component, output_id);

            /* Release this list item */
            OBJ_RELEASE(cli);
            continue;
        }

        if (NULL != component->mca_register_component_params) {
            opal_output_verbose (10, output_id, "mca: base: components_register: "
                                 "component %s register function successful",
                                 component->mca_component_name);
        }

        /* Register this component's version */
        mca_base_var_register (project_name, type_name, component->mca_component_name, "major_version",
                               NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY |
                               MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_CONSTANT,
                               &component->mca_component_major_version);
        mca_base_var_register (project_name, type_name, component->mca_component_name, "minor_version",
                               NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY |
                               MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_CONSTANT,
                               &component->mca_component_minor_version);
        mca_base_var_register (project_name, type_name, component->mca_component_name, "release_version",
                               NULL, MCA_BASE_VAR_TYPE_INT, NULL, 0, MCA_BASE_VAR_FLAG_DEFAULT_ONLY |
                               MCA_BASE_VAR_FLAG_INTERNAL, OPAL_INFO_LVL_9, MCA_BASE_VAR_SCOPE_CONSTANT,
                               &component->mca_component_release_version);
        
        opal_list_append(dest, item);
    }
    
    /* All done */
    
    return OPAL_SUCCESS;
}
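The registration hook invoked above is supplied by each component; it registers the component's MCA variables and returns OPAL_SUCCESS, or OPAL_ERR_NOT_AVAILABLE to be silently skipped. A hypothetical component's hook might look like this (the component name and variable are illustrative, not from the source):

/* Hypothetical registration hook for an "example" component. */
static int example_register(void)
{
    static int example_priority = 10;

    (void) mca_base_component_var_register(&mca_example_component.super,
                                           "priority",
                                           "Selection priority of the example component",
                                           MCA_BASE_VAR_TYPE_INT, NULL, 0, 0,
                                           OPAL_INFO_LVL_9,
                                           MCA_BASE_VAR_SCOPE_READONLY,
                                           &example_priority);

    return OPAL_SUCCESS;  /* or OPAL_ERR_NOT_AVAILABLE to be silently ignored */
}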
Code example #21
File: orted_main.c Project: anandhis/ompi
int orte_daemon(int argc, char *argv[])
{
    int ret = 0;
    opal_cmd_line_t *cmd_line = NULL;
    int i;
    opal_buffer_t *buffer;
    char hostname[OPAL_MAXHOSTNAMELEN];
#if OPAL_ENABLE_FT_CR == 1
    char *tmp_env_var = NULL;
#endif

    /* initialize the globals */
    memset(&orted_globals, 0, sizeof(orted_globals));
    /* initialize the singleton died pipe to an illegal value so we can detect it was set */
    orted_globals.singleton_died_pipe = -1;
    bucket = OBJ_NEW(opal_buffer_t);

    /* setup to check common command line options that just report and die */
    cmd_line = OBJ_NEW(opal_cmd_line_t);
    if (OPAL_SUCCESS != opal_cmd_line_create(cmd_line, orte_cmd_line_opts)) {
        OBJ_RELEASE(cmd_line);
        exit(1);
    }
    mca_base_cmd_line_setup(cmd_line);
    if (ORTE_SUCCESS != (ret = opal_cmd_line_parse(cmd_line, false, false,
                                                   argc, argv))) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        fprintf(stderr, "Usage: %s [OPTION]...\n%s\n", argv[0], args);
        free(args);
        OBJ_RELEASE(cmd_line);
        return ret;
    }

    /*
     * Since this process can now handle MCA/GMCA parameters, make sure to
     * process them.
     */
    mca_base_cmd_line_process_args(cmd_line, &environ, &environ);

    /* Ensure that enough of OPAL is setup for us to be able to run */
    /*
     * NOTE: (JJH)
     *  We need to allow 'mca_base_cmd_line_process_args()' to process command
     *  line arguments *before* calling opal_init_util() since the command
     *  line could contain MCA parameters that affect the way opal_init_util()
     *  functions. AMCA parameters are one such option normally received on the
     *  command line that affect the way opal_init_util() behaves.
     *  It is "safe" to call mca_base_cmd_line_process_args() before
     *  opal_init_util() since mca_base_cmd_line_process_args() does *not*
     *  depend upon opal_init_util() functionality.
     */
    if (OPAL_SUCCESS != opal_init_util(&argc, &argv)) {
        fprintf(stderr, "OPAL failed to initialize -- orted aborting\n");
        exit(1);
    }

    /* save the environment for launch purposes. This MUST be
     * done so that we can pass it to any local procs we
     * spawn - otherwise, those local procs won't see any
     * non-MCA envars that were set in the enviro when the
     * orted was executed - e.g., by .csh
     */
    orte_launch_environ = opal_argv_copy(environ);

    /* purge any ess/pmix flags set in the environ when we were launched */
    opal_unsetenv(OPAL_MCA_PREFIX"ess", &orte_launch_environ);
    opal_unsetenv(OPAL_MCA_PREFIX"pmix", &orte_launch_environ);

    /* if orte_daemon_debug is set, let someone know we are alive right
     * away just in case we have a problem along the way
     */
    if (orted_globals.debug) {
        gethostname(hostname, sizeof(hostname));
        fprintf(stderr, "Daemon was launched on %s - beginning to initialize\n", hostname);
    }

    /* check for help request */
    if (orted_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(cmd_line);
        orte_show_help("help-orted.txt", "orted:usage", false,
                       argv[0], args);
        free(args);
        return 1;
    }
#if defined(HAVE_SETSID)
    /* see if we were directed to separate from current session */
    if (orted_globals.set_sid) {
        setsid();
    }
#endif
    /* see if they want us to spin until they can connect a debugger to us */
    i=0;
    while (orted_spin_flag) {
        i++;
        if (1000 < i) i=0;
    }

#if OPAL_ENABLE_FT_CR == 1
    /* Mark as a tool program */
    (void) mca_base_var_env_name ("opal_cr_is_tool", &tmp_env_var);
    opal_setenv(tmp_env_var,
                "1",
                true, &environ);
    free(tmp_env_var);
#endif

    /* detach from controlling terminal
     * otherwise, remain attached so output can get to us
     */
    if(!orte_debug_flag &&
       !orte_debug_daemons_flag &&
       orted_globals.daemonize) {
        opal_daemon_init(NULL);
    }

    /* Set the flag telling OpenRTE that I am NOT a
     * singleton, but am "infrastructure" - prevents setting
     * up incorrect infrastructure that only a singleton would
     * require.
     */
    if (orted_globals.hnp) {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_HNP))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    } else {
        if (ORTE_SUCCESS != (ret = orte_init(&argc, &argv, ORTE_PROC_DAEMON))) {
            ORTE_ERROR_LOG(ret);
            return ret;
        }
    }

    /* finalize the OPAL utils. As they are opened again from orte_init->opal_init
     * we continue to have a reference count on them. So we have to finalize them twice...
     */
    opal_finalize_util();

    /* bind ourselves if so directed */
    if (NULL != orte_daemon_cores) {
        char **cores=NULL, tmp[128];
        hwloc_obj_t pu;
        hwloc_cpuset_t ours, res;
        int core;

        /* could be a collection of comma-delimited ranges, so
         * use our handy utility to parse it
         */
        orte_util_parse_range_options(orte_daemon_cores, &cores);
        if (NULL != cores) {
            ours = hwloc_bitmap_alloc();
            hwloc_bitmap_zero(ours);
            res = hwloc_bitmap_alloc();
            for (i=0; NULL != cores[i]; i++) {
                core = strtoul(cores[i], NULL, 10);
                if (NULL == (pu = opal_hwloc_base_get_pu(opal_hwloc_topology, core, OPAL_HWLOC_LOGICAL))) {
                    /* turn off the show help forwarding as we won't
                     * be able to cycle the event library to send
                     */
                    orte_show_help_finalize();
                    /* the message will now come out locally */
                    orte_show_help("help-orted.txt", "orted:cannot-bind",
                                   true, orte_process_info.nodename,
                                   orte_daemon_cores);
                    ret = ORTE_ERR_NOT_SUPPORTED;
                    hwloc_bitmap_free(ours);
                    hwloc_bitmap_free(res);
                    goto DONE;
                }
                hwloc_bitmap_or(res, ours, pu->cpuset);
                hwloc_bitmap_copy(ours, res);
            }
            /* if the result is all zeros, then don't bind */
            if (!hwloc_bitmap_iszero(ours)) {
                (void)hwloc_set_cpubind(opal_hwloc_topology, ours, 0);
                if (opal_hwloc_report_bindings) {
                    opal_hwloc_base_cset2mapstr(tmp, sizeof(tmp), opal_hwloc_topology, ours);
                    opal_output(0, "Daemon %s is bound to cores %s",
                                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), tmp);
                }
            }
            /* cleanup */
            hwloc_bitmap_free(ours);
            hwloc_bitmap_free(res);
            opal_argv_free(cores);
        }
    }

    if ((int)ORTE_VPID_INVALID != orted_debug_failure) {
        orted_globals.abort=false;
        /* some vpid was ordered to fail. The value can be positive
         * or negative, depending upon the desired method for failure,
         * so need to check both here
         */
        if (0 > orted_debug_failure) {
            orted_debug_failure = -1*orted_debug_failure;
            orted_globals.abort = true;
        }
        /* are we the specified vpid? */
        if ((int)ORTE_PROC_MY_NAME->vpid == orted_debug_failure) {
            /* if the user specified we delay, then setup a timer
             * and have it kill us
             */
            if (0 < orted_debug_failure_delay) {
                ORTE_TIMER_EVENT(orted_debug_failure_delay, 0, shutdown_callback, ORTE_SYS_PRI);

            } else {
                opal_output(0, "%s is executing clean %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                            orted_globals.abort ? "abort" : "abnormal termination");

                /* do -not- call finalize as this will send a message to the HNP
                 * indicating clean termination! Instead, just forcibly cleanup
                 * the local session_dir tree and exit
                 */
                orte_session_dir_cleanup(ORTE_JOBID_WILDCARD);

                /* if we were ordered to abort, do so */
                if (orted_globals.abort) {
                    abort();
                }

                /* otherwise, return with non-zero status */
                ret = ORTE_ERROR_DEFAULT_EXIT_CODE;
                goto DONE;
            }
        }
    }

    /* insert our contact info into our process_info struct so we
     * have it for later use and set the local daemon field to our name
     */
    orte_oob_base_get_addr(&orte_process_info.my_daemon_uri);
    if (NULL == orte_process_info.my_daemon_uri) {
        /* no way to communicate */
        ret = ORTE_ERROR;
        goto DONE;
    }
    ORTE_PROC_MY_DAEMON->jobid = ORTE_PROC_MY_NAME->jobid;
    ORTE_PROC_MY_DAEMON->vpid = ORTE_PROC_MY_NAME->vpid;

    /* if I am also the hnp, then update that contact info field too */
    if (ORTE_PROC_IS_HNP) {
        orte_process_info.my_hnp_uri = strdup(orte_process_info.my_daemon_uri);
        ORTE_PROC_MY_HNP->jobid = ORTE_PROC_MY_NAME->jobid;
        ORTE_PROC_MY_HNP->vpid = ORTE_PROC_MY_NAME->vpid;
    }

    /* setup the primary daemon command receive function */
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DAEMON,
                            ORTE_RML_PERSISTENT, orte_daemon_recv, NULL);

    /* output a message indicating we are alive, our name, and our pid
     * for debugging purposes
     */
    if (orte_debug_daemons_flag) {
        fprintf(stderr, "Daemon %s checking in as pid %ld on host %s\n",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), (long)orte_process_info.pid,
                orte_process_info.nodename);
    }

    /* We actually do *not* want the orted to voluntarily yield() the
       processor more than necessary.  The orted already blocks when
       it is doing nothing, so it doesn't use any more CPU cycles than
       it should; but when it *is* doing something, we do not want it
       to be unnecessarily delayed because it voluntarily yielded the
       processor in the middle of its work.

       For example: when a message arrives at the orted, we want the
       OS to wake up the orted in a timely fashion (which most OS's
       seem good about doing) and then we want the orted to process
       the message as fast as possible.  If the orted yields and lets
       aggressive MPI applications get the processor back, it may be a
       long time before the OS schedules the orted to run again
       (particularly if there is no IO event to wake it up).  Hence,
       routed OOB messages (for example) may be significantly delayed
       before being delivered to MPI processes, which can be
       problematic in some scenarios (e.g., COMM_SPAWN, BTL's that
       require OOB messages for wireup, etc.). */
    opal_progress_set_yield_when_idle(false);

    /* Change the default behavior of libevent such that we want to
       continually block rather than blocking for the default timeout
       and then looping around the progress engine again.  There
       should be nothing in the orted that cannot block in libevent
       until "something" happens (i.e., there's no need to keep
       cycling through progress because the only things that should
       happen will happen in libevent).  This is a minor optimization,
       but what the heck... :-) */
    opal_progress_set_event_flag(OPAL_EVLOOP_ONCE);

    /* if requested, report my uri to the indicated pipe */
    if (orted_globals.uri_pipe > 0) {
        orte_job_t *jdata;
        orte_proc_t *proc;
        orte_node_t *node;
        orte_app_context_t *app;
        char *tmp, *nptr, *sysinfo;
        char **singenv=NULL, *string_key, *env_str;

        /* setup the singleton's job */
        jdata = OBJ_NEW(orte_job_t);
        /* default to ompi for now */
        opal_argv_append_nosize(&jdata->personality, "ompi");
        orte_plm_base_create_jobid(jdata);
        opal_hash_table_set_value_uint32(orte_job_data, jdata->jobid, jdata);

        /* must create a map for it (even though it has no
         * info in it) so that the job info will be picked
         * up in subsequent pidmaps or other daemons won't
         * know how to route
         */
        jdata->map = OBJ_NEW(orte_job_map_t);

        /* setup an app_context for the singleton */
        app = OBJ_NEW(orte_app_context_t);
        app->app = strdup("singleton");
        app->num_procs = 1;
        opal_pointer_array_add(jdata->apps, app);
        jdata->num_apps = 1;

        /* setup a proc object for the singleton - since we
         * -must- be the HNP, and therefore we stored our
         * node on the global node pool, and since the singleton
         * -must- be on the same node as us, indicate that
         */
        proc = OBJ_NEW(orte_proc_t);
        proc->name.jobid = jdata->jobid;
        proc->name.vpid = 0;
        proc->parent = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_ALIVE);
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        /* obviously, it is on my node */
        node = (orte_node_t*)opal_pointer_array_get_item(orte_node_pool, 0);
        proc->node = node;
        OBJ_RETAIN(node);  /* keep accounting straight */
        opal_pointer_array_add(jdata->procs, proc);
        jdata->num_procs = 1;
        /* add the node to the job map */
        OBJ_RETAIN(node);
        opal_pointer_array_add(jdata->map->nodes, node);
        jdata->map->num_nodes++;
        /* and it obviously is on the node */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(node->procs, proc);
        node->num_procs++;
        /* and obviously it is one of my local procs */
        OBJ_RETAIN(proc);
        opal_pointer_array_add(orte_local_children, proc);
        jdata->num_local_procs = 1;
        /* set the trivial */
        proc->local_rank = 0;
        proc->node_rank = 0;
        proc->app_rank = 0;
        proc->state = ORTE_PROC_STATE_RUNNING;
        proc->app_idx = 0;
        ORTE_FLAG_SET(proc, ORTE_PROC_FLAG_LOCAL);

        /* set the ORTE_JOB_TRANSPORT_KEY from the environment */
        orte_pre_condition_transports(jdata, NULL);

        /* register the singleton's nspace with our PMIx server */
        if (ORTE_SUCCESS != (ret = orte_pmix_server_register_nspace(jdata, false))) {
          ORTE_ERROR_LOG(ret);
          goto DONE;
        }
        /* use setup fork to create the envars needed by the singleton */
        if (OPAL_SUCCESS != (ret = opal_pmix.server_setup_fork(&proc->name, &singenv))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* append the transport key to the envars needed by the singleton */
        if (!orte_get_attribute(&jdata->attributes, ORTE_JOB_TRANSPORT_KEY, (void**)&string_key, OPAL_STRING) || NULL == string_key) {
            ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
            goto DONE;
        }
        asprintf(&env_str, OPAL_MCA_PREFIX"orte_precondition_transports=%s", string_key);
        opal_argv_append_nosize(&singenv, env_str);
        free(env_str);

        nptr = opal_argv_join(singenv, '*');
        opal_argv_free(singenv);

        /* create a string that contains our uri + sysinfo + PMIx server URI envars */
        orte_util_convert_sysinfo_to_string(&sysinfo, orte_local_cpu_type, orte_local_cpu_model);
        asprintf(&tmp, "%s[%s]%s", orte_process_info.my_daemon_uri, sysinfo, nptr);
        free(sysinfo);
        free(nptr);

        /* pass that info to the singleton */
        if (OPAL_SUCCESS != (ret = opal_fd_write(orted_globals.uri_pipe, strlen(tmp)+1, tmp))) { /* need to add 1 to get the NULL */
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }

        /* cleanup */
        free(tmp);
        close(orted_globals.uri_pipe);

        /* since a singleton spawned us, we need to harvest
         * any MCA params from the local environment so
         * we can pass them along to any subsequent daemons
         * we may start as the result of a comm_spawn
         */
        for (i=0; NULL != environ[i]; i++) {
            if (0 == strncmp(environ[i], OPAL_MCA_PREFIX, 9)) {
                /* make a copy to manipulate */
                tmp = strdup(environ[i]);
                /* find the equal sign */
                nptr = strchr(tmp, '=');
                *nptr = '\0';
                nptr++;
                /* add the mca param to the orted cmd line */
                opal_argv_append_nosize(&orted_cmd_line, "-"OPAL_MCA_CMD_LINE_ID);
                opal_argv_append_nosize(&orted_cmd_line, &tmp[9]);
                opal_argv_append_nosize(&orted_cmd_line, nptr);
                free(tmp);
            }
        }
    }

    /* if we were given a pipe to monitor for singleton termination, set that up */
    if (orted_globals.singleton_died_pipe > 0) {
        /* register shutdown handler */
        pipe_handler = (opal_event_t*)malloc(sizeof(opal_event_t));
        opal_event_set(orte_event_base, pipe_handler,
                       orted_globals.singleton_died_pipe,
                       OPAL_EV_READ,
                       pipe_closed,
                       pipe_handler);
        opal_event_add(pipe_handler, NULL);
    }

    /* If I have a parent, then save his contact info so
     * any messages we send can flow thru him.
     */
    orte_parent_uri = NULL;
    (void) mca_base_var_register ("orte", "orte", NULL, "parent_uri",
                                  "URI for the parent if tree launch is enabled.",
                                  MCA_BASE_VAR_TYPE_STRING, NULL, 0,
                                  MCA_BASE_VAR_FLAG_INTERNAL,
                                  OPAL_INFO_LVL_9,
                                  MCA_BASE_VAR_SCOPE_CONSTANT,
                                  &orte_parent_uri);
    if (NULL != orte_parent_uri) {
        orte_process_name_t parent;
        opal_value_t val;

        /* set the contact info into our local database */
        ret = orte_rml_base_parse_uris(orte_parent_uri, &parent, NULL);
        if (ORTE_SUCCESS != ret) {
            ORTE_ERROR_LOG(ret);
            free (orte_parent_uri);
            orte_parent_uri = NULL;
            goto DONE;
        }
        OBJ_CONSTRUCT(&val, opal_value_t);
        val.key = OPAL_PMIX_PROC_URI;
        val.type = OPAL_STRING;
        val.data.string = orte_parent_uri;
        if (OPAL_SUCCESS != (ret = opal_pmix.store_local(&parent, &val))) {
            ORTE_ERROR_LOG(ret);
            OBJ_DESTRUCT(&val);
            goto DONE;
        }
        val.key = NULL;
        val.data.string = NULL;
        OBJ_DESTRUCT(&val);

        /* don't need this value anymore */
        free(orte_parent_uri);
        orte_parent_uri = NULL;

        /* tell the routed module that we have a path
         * back to the HNP
         */
        if (ORTE_SUCCESS != (ret = orte_routed.update_route(NULL, ORTE_PROC_MY_HNP, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
        /* set the lifeline to point to our parent so that we
         * can handle the situation if that lifeline goes away
         */
        if (ORTE_SUCCESS != (ret = orte_routed.set_lifeline(NULL, &parent))) {
            ORTE_ERROR_LOG(ret);
            goto DONE;
        }
    }

    /* if we are not the HNP...the only time we will be an HNP
     * is if we are launched by a singleton to provide support
     * for it
     */
    if (!ORTE_PROC_IS_HNP) {
        orte_process_name_t target;
        target.jobid = ORTE_PROC_MY_NAME->jobid;

        if (orte_fwd_mpirun_port || orte_static_ports) {
            /* setup the rollup callback */
            orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_ORTED_CALLBACK,
                                    ORTE_RML_PERSISTENT, rollup, NULL);
            target.vpid = ORTE_PROC_MY_NAME->vpid;
            /* since we will be waiting for any children to send us
             * their rollup info before sending to our parent, save
             * a little time in the launch phase by "warming up" the
             * connection to our parent while we wait for our children */
            buffer = OBJ_NEW(opal_buffer_t);  // zero-byte message
            if (0 > (ret = orte_rml.send_buffer_nb(orte_mgmt_conduit,
                                                   ORTE_PROC_MY_PARENT, buffer,
                                                   ORTE_RML_TAG_WARMUP_CONNECTION,
                                                   orte_rml_send_callback, NULL))) {
                ORTE_ERROR_LOG(ret);
                OBJ_RELEASE(buffer);
                goto DONE;
            }
        } else {
            target.vpid = 0;
        }

        /* send the information to the orted report-back point - this function
         * will process the data, but also counts the number of
         * orteds that reported back so the launch procedure can continue.
         * We need to do this at the last possible second as the HNP
         * can turn right around and begin issuing orders to us
         */

        buffer = OBJ_NEW(opal_buffer_t);
        /* insert our name for rollup purposes */
        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, ORTE_PROC_MY_NAME, 1, ORTE_NAME))) {
            ORTE_ERROR_LOG(ret);
            OBJ_RELEASE(buffer);
            goto DONE;
        }

        /* get any connection info we may have pushed */
        {
            opal_value_t *val = NULL, *kv;
            opal_list_t *modex;
            int32_t flag;

            if (OPAL_SUCCESS != (ret = opal_pmix.get(ORTE_PROC_MY_NAME, NULL, NULL, &val)) || NULL == val) {
                /* just pack a marker indicating we don't have any to share */
                flag = 0;
                if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                    ORTE_ERROR_LOG(ret);
                    OBJ_RELEASE(buffer);
                    goto DONE;
                }
            } else {
                /* the data is returned as a list of key-value pairs in the opal_value_t */
                if (OPAL_PTR == val->type) {
                    modex = (opal_list_t*)val->data.ptr;
                    flag = (int32_t)opal_list_get_size(modex);
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    OPAL_LIST_FOREACH(kv, modex, opal_value_t) {
                        if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &kv, 1, OPAL_VALUE))) {
                            ORTE_ERROR_LOG(ret);
                            OBJ_RELEASE(buffer);
                            goto DONE;
                        }
                    }
                    OPAL_LIST_RELEASE(modex);
                } else {
                    opal_output(0, "VAL KEY: %s", (NULL == val->key) ? "NULL" : val->key);
                    /* single value */
                    flag = 1;
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &flag, 1, OPAL_INT32))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                    if (ORTE_SUCCESS != (ret = opal_dss.pack(buffer, &val, 1, OPAL_VALUE))) {
                        ORTE_ERROR_LOG(ret);
                        OBJ_RELEASE(buffer);
                        goto DONE;
                    }
                }
                OBJ_RELEASE(val);
            }
Code example #22
/*
 * Function to find as many components of a given type as possible.  This
 * includes statically-linked in components as well as opening up a
 * directory and looking for shared-library MCA components of the
 * appropriate type (load them if available).
 *
 * Return one consolidated array of (mca_base_component_t*) pointing to all
 * available components.
 */
int mca_base_component_find(const char *directory, const char *type, 
                            const mca_base_component_t *static_components[], 
                            const char *requested_components,
                            opal_list_t *found_components,
                            bool open_dso_components)
{
    char **requested_component_names = NULL;
    mca_base_component_list_item_t *cli;
    bool include_mode;
    int i, ret;

    ret = mca_base_component_parse_requested (requested_components, &include_mode,
                                              &requested_component_names);
    if (OPAL_SUCCESS != ret) {
        return ret;
    }

    /* Find all the components that were statically linked in */
    OBJ_CONSTRUCT(found_components, opal_list_t);
    for (i = 0; NULL != static_components &&
             NULL != static_components[i]; ++i) {
        if ( use_component(include_mode,
                           (const char**)requested_component_names,
                           static_components[i]->mca_component_name) ) {
            cli = OBJ_NEW(mca_base_component_list_item_t);
            if (NULL == cli) {
                ret = OPAL_ERR_OUT_OF_RESOURCE;
                goto component_find_out;
            }
            cli->cli_component = static_components[i];
            opal_list_append(found_components, (opal_list_item_t *) cli);
        }
    }

#if OPAL_WANT_LIBLTDL
    /* Find any available dynamic components in the specified directory */
    if (open_dso_components && !mca_base_component_disable_dlopen) {
        find_dyn_components(directory, type,
                            (const char**)requested_component_names,
                            include_mode, found_components); 
    } else {
        opal_output_verbose(40, 0, 
                            "mca: base: component_find: dso loading for %s MCA components disabled", 
                            type);
    }
#endif

    if (include_mode) {
        ret = component_find_check (type, requested_component_names, found_components);
    } else {
        ret = OPAL_SUCCESS;
    }

component_find_out:

    if (NULL != requested_component_names) {
        opal_argv_free(requested_component_names);
    }

    /* All done */

    return ret;
}
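The use_component() helper called above is defined elsewhere in this file; its contract is plain name filtering. A minimal sketch consistent with how it is used here (an empty request list means "take everything"; a match means "use" in include mode and "skip" in exclude mode):

#include <stdbool.h>
#include <string.h>

/* Sketch: decide whether a component name passes the filter. */
static bool use_component(bool include_mode, const char **names,
                          const char *component_name)
{
    int i;

    if (NULL == names || NULL == names[0]) {
        return true;
    }
    for (i = 0; NULL != names[i]; ++i) {
        if (0 == strcmp(names[i], component_name)) {
            return include_mode;
        }
    }
    return !include_mode;
}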
Code example #23
File: pubsub_orte.c Project: XuanWang1982/ompi
/*
 * delete the entry. Only the process who has published
 * the service_name has the right to remove this
 * service - the server will verify and report the result
 */
static int unpublish ( const char *service_name, ompi_info_t *info )
{
    int rc, ret, flag;
    bool global_scope;
    orte_process_name_t *info_host;
    opal_buffer_t *buf;
    orte_data_server_cmd_t cmd=ORTE_DATA_SERVER_UNPUBLISH;
    orte_std_cntr_t cnt;
    orte_rml_recv_cb_t xfer;

    ompi_info_get_bool(info, "ompi_global_scope", &global_scope, &flag);

    if (0 == flag) {
        /* scope was not defined - see if server exists */
        if (!server_setup) {
            setup_server();
        }
        if (mca_pubsub_orte_component.server_found) {
            /* server was found - use it as our default store */
            info_host = &mca_pubsub_orte_component.server;
            global_scope = true;
        } else {
            /* server was not found - use our HNP as default store */
            info_host = ORTE_PROC_MY_HNP;
        }
    } else if (!global_scope) {
        /* if the scope is not global, then unpublish the value from the HNP */
        info_host = ORTE_PROC_MY_HNP;
    } else {
        /* has the server been setup yet? */
        if (!server_setup) {
            setup_server();
        }
        /* unpublish the value from the global ompi_server, but error
        * if that server wasn't contacted
        */
        if (!mca_pubsub_orte_component.server_found) {
            opal_show_help("help-ompi-pubsub-orte.txt", "pubsub-orte:no-server",
                           true, (long)ORTE_PROC_MY_NAME->vpid, "unpublish from");
            return OMPI_ERR_NOT_FOUND;
        }
        info_host = &mca_pubsub_orte_component.server;
    }

    OPAL_OUTPUT_VERBOSE((1, ompi_pubsub_base_framework.framework_output,
                         "%s pubsub:orte: unpublish service %s scope %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                         service_name, global_scope ? "Global" : "Local"));

    /* construct the buffer */
    buf = OBJ_NEW(opal_buffer_t);

    /* pack the unpublish command */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &cmd, 1, ORTE_DATA_SERVER_CMD))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* pack the service name */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(buf, &service_name, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* send the command */
    if (0 > (rc = orte_rml.send_buffer_nb(info_host, buf, ORTE_RML_TAG_DATA_SERVER,
                                          orte_rml_send_callback, NULL))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(buf);
        goto CLEANUP;
    }

    /* get the answer */
    OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
    xfer.active = true;
    orte_rml.recv_buffer_nb(ORTE_NAME_WILDCARD, ORTE_RML_TAG_DATA_CLIENT,
                            ORTE_RML_NON_PERSISTENT,
                            orte_rml_recv_callback, &xfer);
    OMPI_WAIT_FOR_COMPLETION(xfer.active);

    /* unpack the result */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&xfer.data, &ret, &cnt, OPAL_INT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&xfer);
        goto CLEANUP;
    }
    OBJ_DESTRUCT(&xfer);
    rc = ret;

CLEANUP:
    return rc;
}
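At the MPI level this component routine backs MPI_Unpublish_name, and the global store is selected with the same "ompi_global_scope" info key tested above. A short caller:

#include <mpi.h>

/* Remove a previously published service from the global ompi-server. */
int unpublish_service(const char *service, const char *port)
{
    MPI_Info info;

    MPI_Info_create(&info);
    MPI_Info_set(info, "ompi_global_scope", "true");
    int rc = MPI_Unpublish_name(service, info, port);
    MPI_Info_free(&info);
    return rc;
}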
Code example #24
/*
 * Open up all directories in a given path and search for components of
 * the specified type (and possibly of a given name).
 *
 * Note that we use our own path iteration functionality (vs. ltdl's
 * lt_dladdsearchdir() functionality) because we need to look at
 * companion .ompi_info files in the same directory as the library to
 * generate dependencies, etc.  If we use the plain lt_dlopen()
 * functionality, we would not get the directory name of the file
 * finally opened in recursive dependency traversals.
 */
static void find_dyn_components(const char *path, const char *type_name, 
                                const char **names, bool include_mode,
                                opal_list_t *found_components)
{
    int i, len;
    char *path_to_use = NULL, *dir, *end;
    component_file_item_t *file;
    opal_list_item_t *cur;
    char prefix[32 + MCA_BASE_MAX_TYPE_NAME_LEN], *basename;
    
    /* If path is NULL, iterate over the set of directories specified by
       the MCA param mca_base_component_path.  If path is not NULL, then
       use that as the path. */
  
    if (NULL == path) {
        if (NULL != mca_base_component_path) {
            path_to_use = strdup (mca_base_component_path);
        } else {
            /* If there's no path, then there's nothing to search -- we're
               done */
            return;
        }

        if (NULL == path_to_use) {
            /* out of memory */
            return;
        }
    } else {
        path_to_use = strdup(path);
    }
  
    /* If we haven't done so already, iterate over all the files in
       the directories in the path and make a master array of all the
       matching filenames that we find.  Save the filenames in an
       argv-style array.  Re-scan do this if the mca_component_path
       has changed. */
    if (NULL == found_filenames || 
        (NULL != last_path_to_use && 
         0 != strcmp(path_to_use, last_path_to_use))) {
        if (NULL != found_filenames) {
            opal_argv_free(found_filenames);
            found_filenames = NULL;
            free(last_path_to_use);
            last_path_to_use = NULL;
        }
        if (NULL == last_path_to_use) {
            last_path_to_use = strdup(path_to_use);
        }

        dir = path_to_use;
        if (NULL != dir) {
            do {
                end = strchr(dir, OPAL_ENV_SEP);
                if (NULL != end) {
                    *end = '\0';
                }
                if ((0 == strcmp(dir, "USER_DEFAULT") ||
                     0 == strcmp(dir, "USR_DEFAULT"))
                    && NULL != mca_base_user_default_path) {
                    if (0 != lt_dlforeachfile(mca_base_user_default_path,
                                              save_filename, NULL)) {
                        break;
                    }
                } else if (0 == strcmp(dir, "SYS_DEFAULT") ||
                           0 == strcmp(dir, "SYSTEM_DEFAULT")) {
                    if (0 != lt_dlforeachfile(mca_base_system_default_path,
                                              save_filename, NULL)) {
                        break;
                    }                    
                } else {
                    if (0 != lt_dlforeachfile(dir, save_filename, NULL)) {
                        break;
                    }
                }
                dir = end + 1;
            } while (NULL != end);
        }
    }
    
    /* Look through the list of found files and find those that match
       the desired framework name */
    snprintf(prefix, sizeof(prefix) - 1, component_template, type_name);
    len = strlen(prefix);
    OBJ_CONSTRUCT(&found_files, opal_list_t);
    for (i = 0; NULL != found_filenames && NULL != found_filenames[i]; ++i) {
        basename = strrchr(found_filenames[i], '/');
        if (NULL == basename) {
            basename = found_filenames[i];
        } else {
            basename += 1;
        }
        
        if (0 != strncmp(basename, prefix, len)) {
            continue;
        }
        
        /* We found a match; save all the relevant details in the
           found_files list */
        file = OBJ_NEW(component_file_item_t);
        if (NULL == file) {
            return;
        }
        strncpy(file->type, type_name, MCA_BASE_MAX_TYPE_NAME_LEN);
        file->type[MCA_BASE_MAX_TYPE_NAME_LEN] = '\0';
        strncpy(file->name, basename + len, MCA_BASE_MAX_COMPONENT_NAME_LEN);
        file->name[MCA_BASE_MAX_COMPONENT_NAME_LEN] = '\0';
        strncpy(file->basename, basename, OPAL_PATH_MAX);
        file->basename[OPAL_PATH_MAX] = '\0';
        strncpy(file->filename, found_filenames[i], OPAL_PATH_MAX);
        file->filename[OPAL_PATH_MAX] = '\0';
        file->status = UNVISITED;

        opal_list_append(&found_files, (opal_list_item_t *) 
                         file);
    }

    /* Iterate through all the filenames that we found that matched
       the framework we were looking for.  Since one component may
       [try to] call another to be loaded, only try to load the
       UNVISITED files.  Also, ignore the return code -- basically,
       give every file one chance to try to load.  If they load,
       great.  If not, great. */
    for (cur = opal_list_get_first(&found_files); 
         opal_list_get_end(&found_files) != cur;
         cur = opal_list_get_next(cur)) {
        file = (component_file_item_t *) cur;

        if( UNVISITED == file->status ) {
            bool op = true;
            file->status = CHECKING_CYCLE;

            op = use_component(include_mode, names, file->name);
            if( true == op ) {
                open_component(file, found_components);
            }
        }
    }
    
    /* So now we have a final list of loaded components.  We can free all
       the file information. */
    for (cur = opal_list_remove_first(&found_files); 
         NULL != cur;
         cur = opal_list_remove_first(&found_files)) {
        OBJ_RELEASE(cur);
    }
    OBJ_DESTRUCT(&found_files);

    /* All done, now let's cleanup */
    free(path_to_use);
}
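The save_filename() callback handed to lt_dlforeachfile() above is likewise defined elsewhere in this file. libltdl's contract is that the callback returns 0 to continue iterating; a plausible sketch that accumulates candidates into the found_filenames argv array:

/* Sketch of the lt_dlforeachfile() callback: remember every file we
 * see; returning 0 tells libltdl to keep walking the directory. */
static int save_filename(const char *filename, void *data)
{
    opal_argv_append_nosize(&found_filenames, filename);
    return 0;
}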
Code example #25
File: lookup_name.c Project: kawashima-fj/ompi
int MPI_Lookup_name(const char *service_name, MPI_Info info, char *port_name)
{
    char range[OPAL_MAX_INFO_VAL];
    int flag=0, ret;
    opal_value_t *rng;
    opal_list_t results, pinfo;
    opal_pmix_pdata_t *pdat;

    if ( MPI_PARAM_CHECK ) {
        OMPI_ERR_INIT_FINALIZE(FUNC_NAME);

        if ( NULL == port_name ) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
                                          FUNC_NAME);
        }
        if ( NULL == service_name ) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
                                          FUNC_NAME);
        }
        if (NULL == info || ompi_info_is_freed(info)) {
            return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_INFO,
                                          FUNC_NAME);
        }
    }

    OPAL_CR_ENTER_LIBRARY();

    OBJ_CONSTRUCT(&pinfo, opal_list_t);

    /* OMPI supports info keys to pass the range to
     * be searched for the given key */
    if (MPI_INFO_NULL != info) {
        ompi_info_get (info, "range", sizeof(range) - 1, range, &flag);
        if (flag) {
            if (0 == strcmp(range, "nspace")) {
                rng = OBJ_NEW(opal_value_t);
                rng->key = strdup(OPAL_PMIX_RANGE);
                rng->type = OPAL_INT;
                rng->data.integer = OPAL_PMIX_NAMESPACE;  // share only with procs in same nspace
                opal_list_append(&pinfo, &rng->super);
            } else if (0 == strcmp(range, "session")) {
                rng = OBJ_NEW(opal_value_t);
                rng->key = strdup(OPAL_PMIX_RANGE);
                rng->type = OPAL_INT;
                rng->data.integer = OPAL_PMIX_SESSION; // share only with procs in same session
                opal_list_append(&pinfo, &rng->super);
            } else {
                /* unrecognized scope */
                OPAL_LIST_DESTRUCT(&pinfo);
                OPAL_CR_EXIT_LIBRARY();
                return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_ARG,
                                              FUNC_NAME);
            }
        }
    }

    /* collect the findings */
    OBJ_CONSTRUCT(&results, opal_list_t);
    pdat = OBJ_NEW(opal_pmix_pdata_t);
    pdat->value.key = strdup(service_name);
    opal_list_append(&results, &pdat->super);

    ret = opal_pmix.lookup(&results, &pinfo);
    OPAL_LIST_DESTRUCT(&pinfo);
    if (OPAL_SUCCESS != ret ||
        OPAL_STRING != pdat->value.type ||
        NULL == pdat->value.data.string) {
        OPAL_LIST_DESTRUCT(&results);
        OPAL_CR_EXIT_LIBRARY();
        return OMPI_ERRHANDLER_INVOKE(MPI_COMM_WORLD, MPI_ERR_NAME,
                                      FUNC_NAME);
    }

    strncpy ( port_name, pdat->value.data.string, MPI_MAX_PORT_NAME );
    OPAL_LIST_DESTRUCT(&results);

    OPAL_CR_EXIT_LIBRARY();
    return MPI_SUCCESS;
}
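From the application side, the "range" info key handled above restricts where the lookup searches ("nspace" or "session"). A minimal caller (the service name is a placeholder):

#include <mpi.h>

/* Look up a published port, restricting the search to our session. */
int lookup_demo(char *port_name)
{
    MPI_Info info;

    MPI_Info_create(&info);
    MPI_Info_set(info, "range", "session");
    int rc = MPI_Lookup_name("demo-service", info, port_name);
    MPI_Info_free(&info);
    return rc;
}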
Code example #26
File: rmaps_ppr.c Project: jjhursey/ompi
static int ppr_mapper(orte_job_t *jdata)
{
    int rc = ORTE_SUCCESS, j, n;
    mca_base_component_t *c=&mca_rmaps_ppr_component.base_version;
    orte_node_t *node;
    orte_proc_t *proc;
    orte_app_context_t *app;
    orte_vpid_t total_procs, nprocs_mapped;
    opal_hwloc_level_t start=OPAL_HWLOC_NODE_LEVEL;
    hwloc_obj_t obj;
    hwloc_obj_type_t lowest;
    unsigned cache_level=0;
    unsigned int nobjs, i;
    bool pruning_reqd = false;
    opal_hwloc_level_t level;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_slots;
    orte_app_idx_t idx;
    char **ppr_req, **ck;
    size_t len;
    bool initial_map=true;

    /* only handle initial launch of loadbalanced
     * or NPERxxx jobs - allow restarting of failed apps
     */
    if (ORTE_FLAG_TEST(jdata, ORTE_JOB_FLAG_RESTART)) {
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s being restarted - ppr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL == jdata->map->ppr ||
        ORTE_MAPPING_PPR != ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* not for us */
        opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                            "mca:rmaps:ppr: job %s not using ppr mapper PPR %s policy %s",
                            ORTE_JOBID_PRINT(jdata->jobid),
                            (NULL == jdata->map->ppr) ? "NULL" : jdata->map->ppr,
                            (ORTE_MAPPING_PPR == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) ? "PPRSET" : "PPR NOTSET");
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: mapping job %s with ppr %s",
                        ORTE_JOBID_PRINT(jdata->jobid), jdata->map->ppr);

    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* initialize - the ppr array is indexed up to and including
     * OPAL_HWLOC_HWTHREAD_LEVEL, so zero every entry */
    memset(ppr, 0, (OPAL_HWLOC_HWTHREAD_LEVEL + 1) * sizeof(opal_hwloc_level_t));

    /* parse option */
    n=0;
    ppr_req = opal_argv_split(jdata->map->ppr, ',');
    for (j=0; NULL != ppr_req[j]; j++) {
        /* split on the colon */
        ck = opal_argv_split(ppr_req[j], ':');
        if (2 != opal_argv_count(ck)) {
            /* must provide a specification */
            orte_show_help("help-orte-rmaps-ppr.txt", "invalid-ppr", true, jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        len = strlen(ck[1]);
        if (0 == strncasecmp(ck[1], "node", len)) {
            ppr[OPAL_HWLOC_NODE_LEVEL] = strtol(ck[0], NULL, 10);
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNODE);
            start = OPAL_HWLOC_NODE_LEVEL;
            n++;
        } else if (0 == strncasecmp(ck[1], "hwthread", len) ||
                   0 == strncasecmp(ck[1], "thread", len)) {
            ppr[OPAL_HWLOC_HWTHREAD_LEVEL] = strtol(ck[0], NULL, 10);
            start = OPAL_HWLOC_HWTHREAD_LEVEL;
            ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYHWTHREAD);
            n++;
        } else if (0 == strncasecmp(ck[1], "core", len)) {
            ppr[OPAL_HWLOC_CORE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_CORE_LEVEL) {
                start = OPAL_HWLOC_CORE_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYCORE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "socket", len) ||
                   0 == strncasecmp(ck[1], "skt", len)) {
            ppr[OPAL_HWLOC_SOCKET_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_SOCKET_LEVEL) {
                start = OPAL_HWLOC_SOCKET_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYSOCKET);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l1cache", len)) {
            ppr[OPAL_HWLOC_L1CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L1CACHE_LEVEL) {
                start = OPAL_HWLOC_L1CACHE_LEVEL;
                cache_level = 1;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL1CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l2cache", len)) {
            ppr[OPAL_HWLOC_L2CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L2CACHE_LEVEL) {
                start = OPAL_HWLOC_L2CACHE_LEVEL;
                cache_level = 2;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL2CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "l3cache", len)) {
            ppr[OPAL_HWLOC_L3CACHE_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_L3CACHE_LEVEL) {
                start = OPAL_HWLOC_L3CACHE_LEVEL;
                cache_level = 3;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYL3CACHE);
            }
            n++;
        } else if (0 == strncasecmp(ck[1], "numa", len)) {
            ppr[OPAL_HWLOC_NUMA_LEVEL] = strtol(ck[0], NULL, 10);
            if (start < OPAL_HWLOC_NUMA_LEVEL) {
                start = OPAL_HWLOC_NUMA_LEVEL;
                ORTE_SET_MAPPING_POLICY(jdata->map->mapping, ORTE_MAPPING_BYNUMA);
            }
            n++;
        } else {
            /* unknown spec */
            orte_show_help("help-orte-rmaps-ppr.txt", "unrecognized-ppr-option", true, ck[1], jdata->map->ppr);
            opal_argv_free(ppr_req);
            opal_argv_free(ck);
            return ORTE_ERR_SILENT;
        }
        opal_argv_free(ck);
    }
    opal_argv_free(ppr_req);
    /* if nothing was given, that's an error */
    if (0 == n) {
        opal_output(0, "NOTHING GIVEN");
        return ORTE_ERR_SILENT;
    }
    /* if more than one level was specified, then pruning will be reqd */
    if (1 < n) {
        pruning_reqd = true;
    }

    opal_output_verbose(5, orte_rmaps_base_framework.framework_output,
                        "mca:rmaps:ppr: job %s assigned policy %s",
                        ORTE_JOBID_PRINT(jdata->jobid),
                        orte_rmaps_base_print_mapping(jdata->map->mapping));

    /* convenience */
    level = start;
    lowest = opal_hwloc_levels[start];

    for (idx=0; idx < (orte_app_idx_t)jdata->apps->size; idx++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, idx))) {
            continue;
        }

        /* if the number of total procs was given, set that
         * limit - otherwise, set to max so we simply fill
         * all the nodes with the pattern
         */
        if (0 < app->num_procs) {
            total_procs = app->num_procs;
        } else {
            total_procs = ORTE_VPID_MAX;
        }

        /* get the available nodes */
        OBJ_CONSTRUCT(&node_list, opal_list_t);
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);

        /* cycle across the nodes */
        nprocs_mapped = 0;
        for (item = opal_list_get_first(&node_list);
             item != opal_list_get_end(&node_list);
             item = opal_list_get_next(item)) {
            node = (orte_node_t*)item;
            /* bozo check */
            if (NULL == node->topology) {
                orte_show_help("help-orte-rmaps-ppr.txt", "ppr-topo-missing",
                               true, node->name);
                rc = ORTE_ERR_SILENT;
                goto error;
            }
            /* add the node to the map, if needed */
            if (!ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_MAPPED)) {
                if (ORTE_SUCCESS > (rc = opal_pointer_array_add(jdata->map->nodes, (void*)node))) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_MAPPED);
                OBJ_RETAIN(node);  /* maintain accounting on object */
                jdata->map->num_nodes++;
            }
            /* if we are mapping solely at the node level, just put
             * that many procs on this node
             */
            if (OPAL_HWLOC_NODE_LEVEL == start) {
                obj = hwloc_get_root_obj(node->topology);
                for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                    if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                        rc = ORTE_ERR_OUT_OF_RESOURCE;
                        goto error;
                    }
                    nprocs_mapped++;
                    orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                }
            } else {
                /* get the number of lowest resources on this node */
                nobjs = opal_hwloc_base_get_nbobjs_by_type(node->topology,
                                                           lowest, cache_level,
                                                           OPAL_HWLOC_AVAILABLE);

                /* map the specified number of procs to each such resource on this node,
                 * recording the locale of each proc so we know its cpuset
                 */
                for (i=0; i < nobjs; i++) {
                    obj = opal_hwloc_base_get_obj_by_type(node->topology,
                                                          lowest, cache_level,
                                                          i, OPAL_HWLOC_AVAILABLE);
                    for (j=0; j < ppr[start] && nprocs_mapped < total_procs; j++) {
                        if (NULL == (proc = orte_rmaps_base_setup_proc(jdata, node, idx))) {
                            rc = ORTE_ERR_OUT_OF_RESOURCE;
                            goto error;
                        }
                        nprocs_mapped++;
                        orte_set_attribute(&proc->attributes, ORTE_PROC_HWLOC_LOCALE, ORTE_ATTR_LOCAL, obj, OPAL_PTR);
                    }
                }

                if (pruning_reqd) {
                    /* go up the ladder and prune the procs according to
                     * the specification, adjusting the count of procs on the
                     * node as we go
                     */
                    level--;
                    prune(jdata->jobid, idx, node, &level, &nprocs_mapped);
                }
            }

            /* set the total slots used */
            if ((int)node->num_procs <= node->slots) {
                node->slots_inuse = (int)node->num_procs;
            } else {
                node->slots_inuse = node->slots;
            }

            /* if no-oversubscribe was specified, check to see if
             * we have violated the total slot specification - regardless,
             * if slots_max was given, we are not allowed to violate it!
             */
            if ((node->slots < (int)node->num_procs) ||
                (0 < node->slots_max && node->slots_max < (int)node->num_procs)) {
                if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                    orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                   true, node->num_procs, app->app);
                    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                    rc = ORTE_ERR_SILENT;
                    goto error;
                }
                /* flag the node as oversubscribed so that sched-yield gets
                 * properly set
                 */
                ORTE_FLAG_SET(node, ORTE_NODE_FLAG_OVERSUBSCRIBED);
                ORTE_FLAG_SET(jdata, ORTE_JOB_FLAG_OVERSUBSCRIBED);
                /* check for permission */
                if (ORTE_FLAG_TEST(node, ORTE_NODE_FLAG_SLOTS_GIVEN)) {
                    /* if we weren't given a directive either way, then we will error out
                     * as the #slots were specifically given, either by the host RM or
                     * via hostfile/dash-host */
                    if (!(ORTE_MAPPING_SUBSCRIBE_GIVEN & ORTE_GET_MAPPING_DIRECTIVE(orte_rmaps_base.mapping))) {
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    } else if (ORTE_MAPPING_NO_OVERSUBSCRIBE & ORTE_GET_MAPPING_DIRECTIVE(jdata->map->mapping)) {
                        /* if we were explicitly told not to oversubscribe, then don't */
                        orte_show_help("help-orte-rmaps-base.txt", "orte-rmaps-base:alloc-error",
                                       true, app->num_procs, app->app);
                        ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
                        return ORTE_ERR_SILENT;
                    }
                }
            }

            /* if we have mapped all the procs, then we are done with
             * this app - otherwise, continue on to the next node
             */
            if (total_procs == nprocs_mapped) {
                break;
            }
        }
        if (0 == app->num_procs) {
            app->num_procs = nprocs_mapped;
        }
        if (ORTE_VPID_MAX != total_procs && nprocs_mapped < total_procs) {
            /* couldn't map them all */
            orte_show_help("help-orte-rmaps-ppr.txt", "ppr-too-many-procs",
                           true, app->app, app->num_procs, jdata->map->ppr);
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        /* compute vpids and add proc objects to the job */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* track the total number of processes we mapped - must update
         * this AFTER we compute vpids so that computation is done
         * correctly
         */
        jdata->num_procs += app->num_procs;

        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }
    return ORTE_SUCCESS;

 error:
    while (NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);
    return rc;
}
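For reference, the option parsing above reduces to splitting the comma-separated spec into "N:resource" pairs and recording one count per hwloc level. The sketch below captures that shape under stated assumptions: level_names and parse_ppr are hypothetical, and the real mapper records into the opal_hwloc_level_t-indexed ppr array and sets the mapping policy as shown above.

#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include "opal/util/argv.h"

/* Hypothetical resource-name table; the real code tests each name
 * explicitly and also sets the job's mapping policy. */
static const char *level_names[] = {
    "node", "numa", "socket", "l3cache", "l2cache",
    "l1cache", "core", "hwthread", NULL
};

/* Parse "2:socket,4:core"-style specs into per-level counts.
 * Returns 0 on success, -1 on a malformed pair. */
static int parse_ppr(const char *spec, int *nperlevel)
{
    char **pairs = opal_argv_split(spec, ',');
    int j, k, rc = 0;

    for (j = 0; NULL != pairs[j]; j++) {
        char **ck = opal_argv_split(pairs[j], ':');
        if (2 != opal_argv_count(ck)) {
            rc = -1;                      /* must be "N:resource" */
            opal_argv_free(ck);
            break;
        }
        for (k = 0; NULL != level_names[k]; k++) {
            /* prefix match, mirroring the strncasecmp usage above */
            if (0 == strncasecmp(ck[1], level_names[k], strlen(ck[1]))) {
                nperlevel[k] = (int)strtol(ck[0], NULL, 10);
                break;
            }
        }
        opal_argv_free(ck);
    }
    opal_argv_free(pairs);
    return rc;
}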
Code example #27
/*
 * Create a round-robin mapping for the job.
 */
static int orte_rmaps_rr_map(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i;
    opal_list_t node_list;
    opal_list_item_t *item;
    orte_std_cntr_t num_nodes, num_slots;
    int rc;
    mca_base_component_t *c = &mca_rmaps_round_robin_component.base_version;
    bool initial_map=true;

    /* this mapper can only handle initial launch
     * when rr mapping is desired - allow
     * restarting of failed apps
     */
    if (ORTE_JOB_CONTROL_RESTART & jdata->controls) {
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr: job %s is being restarted - rr cannot map",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (NULL != jdata->map->req_mapper &&
        0 != strcasecmp(jdata->map->req_mapper, c->mca_component_name)) {
        /* a mapper has been specified, and it isn't me */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }
    if (ORTE_MAPPING_RR < ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
        /* I don't know how to do these - defer */
        opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                            "mca:rmaps:rr: job %s not using rr mapper",
                            ORTE_JOBID_PRINT(jdata->jobid));
        return ORTE_ERR_TAKE_NEXT_OPTION;
    }

    opal_output_verbose(5, orte_rmaps_base.rmaps_output,
                        "mca:rmaps:rr: mapping job %s",
                        ORTE_JOBID_PRINT(jdata->jobid));
 
    /* flag that I did the mapping */
    if (NULL != jdata->map->last_mapper) {
        free(jdata->map->last_mapper);
    }
    jdata->map->last_mapper = strdup(c->mca_component_name);

    /* start at the beginning... */
    jdata->num_procs = 0;
    
    /* cycle through the app_contexts, mapping them sequentially */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        
        /* setup the nodelist here in case we jump to error */
        OBJ_CONSTRUCT(&node_list, opal_list_t);

        /* if the number of processes wasn't specified, then we know there can be only
         * one app_context allowed in the launch, and that we are to launch it across
         * all available slots. We'll double-check the single app_context rule first
         */
        if (0 == app->num_procs && 1 < jdata->num_apps) {
            orte_show_help("help-orte-rmaps-rr.txt", "orte-rmaps-rr:multi-apps-and-zero-np",
                           true, jdata->num_apps, NULL);
            rc = ORTE_ERR_SILENT;
            goto error;
        }

        /* for each app_context, we have to get the list of nodes that it can
         * use since that can now be modified with a hostfile and/or -host
         * option
         */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->mapping, initial_map, false))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        num_nodes = (orte_std_cntr_t)opal_list_get_size(&node_list);
        /* flag that all subsequent requests should not reset the node->mapped flag */
        initial_map = false;

        /* if a bookmark exists from some prior mapping, set us to start there */
        jdata->bookmark = orte_rmaps_base_get_starting_point(&node_list, jdata);
        
        if (0 == app->num_procs) {
            /* set the num_procs to equal the number of slots on these mapped nodes */
            app->num_procs = num_slots;
        }
        
        /* Make assignments */
        if (ORTE_MAPPING_BYNODE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_bynode(jdata, app, &node_list, num_slots,
                                      app->num_procs);
        } else if (ORTE_MAPPING_BYSLOT == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byslot(jdata, app, &node_list, num_slots,
                                      app->num_procs);
#if OPAL_HAVE_HWLOC
        } else if (ORTE_MAPPING_BYHWTHREAD == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_PU, 0);
        } else if (ORTE_MAPPING_BYCORE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CORE, 0);
        } else if (ORTE_MAPPING_BYL1CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 1);
        } else if (ORTE_MAPPING_BYL2CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 2);
        } else if (ORTE_MAPPING_BYL3CACHE == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_CACHE, 3);
        } else if (ORTE_MAPPING_BYSOCKET == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_SOCKET, 0);
        } else if (ORTE_MAPPING_BYNUMA == ORTE_GET_MAPPING_POLICY(jdata->map->mapping)) {
            rc = orte_rmaps_rr_byobj(jdata, app, &node_list, num_slots,
                                     app->num_procs, HWLOC_OBJ_NODE, 0);
#endif
        } else {
            /* unrecognized mapping directive */
            orte_show_help("help-orte-rmaps-base.txt", "unrecognized-policy",
                           true, "mapping",
                           orte_rmaps_base_print_mapping(jdata->map->mapping));
            rc = ORTE_ERR_SILENT;
            goto error;
        }
        if (ORTE_SUCCESS != rc) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }

        /* compute vpids and add proc objects to the job - do this after
         * each app_context so that the ranks within each context are
         * contiguous
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata, app, &node_list))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }

        /* track the total number of processes we mapped - must update
         * this value AFTER we compute vpids so that computation
         * is done correctly
         */
        jdata->num_procs += app->num_procs;

        /* cleanup the node list - it can differ from one app_context
         * to another, so we have to get it every time
         */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        OBJ_DESTRUCT(&node_list);
    }

    return ORTE_SUCCESS;

 error:
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
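The dispatch above only selects which rr_byX helper walks the node list; the by-slot strategy itself is two counting passes. A self-contained plain-C illustration (no ORTE types; slot counts in, per-node assignments out):

/* Minimal sketch of by-slot round-robin placement: fill each node up
 * to its slot count, then wrap one proc per node for any remainder
 * (i.e. oversubscription).  Illustrative only. */
static void rr_byslot_sketch(int nprocs, const int *slots, int nnodes,
                             int *assigned)
{
    int n, mapped = 0;

    /* first pass: consume the available slots in node order */
    for (n = 0; n < nnodes && mapped < nprocs; n++) {
        while (assigned[n] < slots[n] && mapped < nprocs) {
            assigned[n]++;
            mapped++;
        }
    }
    /* second pass: wrap around, one proc per node, until done */
    n = 0;
    while (mapped < nprocs) {
        assigned[n % nnodes]++;
        mapped++;
        n++;
    }
}

By-node mapping differs only in loop order: it places one proc per node per sweep instead of filling a node before moving on.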
Code example #28
File: rmaps_lb.c  Project: 315234/OpenFOAM-2.2.x-OSX
/*
 * Create a load balanced mapping for the job by assigning a constant #procs/node, with
 * leftovers being spread one/node starting from the first node.
 */
static int loadbalance(orte_job_t *jdata)
{
    orte_app_context_t *app;
    int i, j;
    opal_list_t node_list;
    orte_std_cntr_t num_nodes, num_slots;
    int rc=ORTE_SUCCESS, np, nprocs;
    int ppn = 0;
    opal_list_item_t *item, *start;
    orte_node_t *node;

    /* setup */
    OBJ_CONSTRUCT(&node_list, opal_list_t);

    /* compute total #procs we are going to add and the total number of nodes available */
    for(i=0; i < jdata->apps->size; i++) {
        if (NULL == (app = (orte_app_context_t*)opal_pointer_array_get_item(jdata->apps, i))) {
            continue;
        }
        /* get the nodes and #slots available for this app_context */
        if(ORTE_SUCCESS != (rc = orte_rmaps_base_get_target_nodes(&node_list, &num_slots, app,
                                                                  jdata->map->policy))) {
            ORTE_ERROR_LOG(rc);
            goto error;
        }
        if (0 < app->num_procs) {
            np = app->num_procs;
        } else {
            /* set the num_procs to the #slots */
            np = num_slots;
        }
        num_nodes = opal_list_get_size(&node_list);
        /* compute the base ppn */
        ppn = np / num_nodes;
        /* if a bookmark exists from some prior mapping, set us to start there */
        start = orte_rmaps_base_get_starting_point(&node_list, jdata);
        /* loop through the list of nodes until we either assign all the procs
         * or return to the starting point
         */
        item = start;
        nprocs = 0;
        do {
            node = (orte_node_t*)item;
            /* put the specified number of procs on each node */
            for (j=0; j < ppn; j++) {
                if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                     jdata->map->cpus_per_rank, app->idx,
                                                                     &node_list, jdata->map->oversubscribe,
                                                                     false, NULL))) {
                    /** if the code is ORTE_ERR_NODE_FULLY_USED, and we still have
                     * more procs to place, then that is an error
                     */
                    if (ORTE_ERR_NODE_FULLY_USED != rc ||
                        j < ppn-1) {
                        ORTE_ERROR_LOG(rc);
                        goto error;
                    }
                }
                nprocs++;
            }
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        } while (item != start && nprocs < np);
        
        /* save the bookmark */
        jdata->bookmark = node;

        /* if we haven't assigned all the procs, then loop through the list
         * again, assigning 1 per node until all are assigned
         */
        item = start;
        while (nprocs < np) {
            node = (orte_node_t*)item;
            if (ORTE_SUCCESS != (rc = orte_rmaps_base_claim_slot(jdata, node,
                                                                 jdata->map->cpus_per_rank, app->idx,
                                                                 &node_list, jdata->map->oversubscribe,
                                                                 false, NULL))) {
                /* if the code is not ORTE_ERR_NODE_FULLY_USED, then that is an error */
                if (ORTE_ERR_NODE_FULLY_USED != rc) {
                    ORTE_ERROR_LOG(rc);
                    goto error;
                }
            }
            nprocs++;
            /* move to next node */
            if (opal_list_get_end(&node_list) == opal_list_get_next(item)) {
                item = opal_list_get_first(&node_list);
            }
            else {
                item = opal_list_get_next(item);
            }
        }
        /* save the bookmark */
        jdata->bookmark = node;
        /* update the number of procs in the job */
        jdata->num_procs += nprocs;
        
        /* cleanup */
        while (NULL != (item = opal_list_remove_first(&node_list))) {
            OBJ_RELEASE(item);
        }
        /* if the user requested a specific number of procs and
         * the total number of procs we were able to assign
         * doesn't equal the number requested, then we have a
         * problem
         */
        if (0 < app->num_procs && nprocs < app->num_procs) {
            orte_show_help("help-orte-rmaps-base.txt", "rmaps:too-many-procs", true,
                           app->app, app->num_procs,
                           "number of slots", nprocs,
                           "number of nodes", num_nodes);
            return ORTE_ERR_SILENT;
        }
        /* compute vpids and add proc objects to the job - this has to be
         * done after each app_context is mapped in order to keep the
         * vpids contiguous within an app_context
         */
        if (ORTE_SUCCESS != (rc = orte_rmaps_base_compute_vpids(jdata))) {
            ORTE_ERROR_LOG(rc);
            return rc;
        }
    }
    
error:
    while(NULL != (item = opal_list_remove_first(&node_list))) {
        OBJ_RELEASE(item);
    }
    OBJ_DESTRUCT(&node_list);

    return rc;
}
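Stripped of the list handling, the distribution loadbalance() computes is: every node receives np / num_nodes procs, and the np % num_nodes leftovers are placed one per node from the first node onward. A minimal sketch of that arithmetic:

/* Illustrative only: the constant-ppn-plus-leftovers arithmetic
 * behind loadbalance() above.  The per-node counts always sum to np. */
static void balance_sketch(int np, int nnodes, int *pernode)
{
    int base  = np / nnodes;   /* constant #procs per node */
    int extra = np % nnodes;   /* leftovers, one each from the first node */
    int n;

    for (n = 0; n < nnodes; n++) {
        pernode[n] = base + ((n < extra) ? 1 : 0);
    }
}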
Code example #29
File: pmix_native.c  Project: nrgraham23/ompi
static int native_get(const opal_process_name_t *id,
                      const char *key,
                      opal_value_t **kv)
{
    opal_buffer_t *msg, *bptr;
    pmix_cmd_t cmd = PMIX_GET_CMD;
    pmix_cb_t *cb;
    int rc, ret;
    int32_t cnt;
    opal_list_t vals;
    opal_value_t *kp;
    bool found;

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:native getting value for proc %s key %s",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME),
                        OPAL_NAME_PRINT(*id), key);

    /* first see if we already have the info in our dstore */
    OBJ_CONSTRUCT(&vals, opal_list_t);
    if (OPAL_SUCCESS == opal_dstore.fetch(opal_dstore_internal, id,
                                          key, &vals)) {
        *kv = (opal_value_t*)opal_list_remove_first(&vals);
        OPAL_LIST_DESTRUCT(&vals);
        opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                            "%s pmix:native value retrieved from dstore",
                            OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
        return OPAL_SUCCESS;
    }

    if (NULL == mca_pmix_native_component.uri) {
        /* no server available, so just return */
        return OPAL_ERR_NOT_FOUND;
    }

    /* nope - see if we can get it */
    msg = OBJ_NEW(opal_buffer_t);
    /* pack the get cmd */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, &cmd, 1, PMIX_CMD_T))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return rc;
    }
    /* pack the request information - we'll get the entire blob
     * for this proc, so we don't need to pass the key */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(msg, id, 1, OPAL_NAME))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(msg);
        return rc;
    }
    /* create a callback object as we need to pass it to the
     * recv routine so we know which callback to use when
     * the return message is recvd */
    cb = OBJ_NEW(pmix_cb_t);
    cb->active = true;

    /* push the message into our event base to send to the server */
    PMIX_ACTIVATE_SEND_RECV(msg, wait_cbfunc, cb);

    /* wait for the data to return */
    PMIX_WAIT_FOR_COMPLETION(cb->active);

    /* we have received the entire data blob for this process - unpack
     * and cache all values, keeping the one we requested to return
     * to the caller */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(&cb->data, &ret, &cnt, OPAL_INT))) {
        OPAL_ERROR_LOG(rc);
        OBJ_RELEASE(cb);
        return rc;
    }
    found = false;
    cnt = 1;
    while (OPAL_SUCCESS == (rc = opal_dss.unpack(&cb->data, &bptr, &cnt, OPAL_BUFFER))) {
        while (OPAL_SUCCESS == (rc = opal_dss.unpack(bptr, &kp, &cnt, OPAL_VALUE))) {
            opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                                "%s pmix:native retrieved %s (%s) from server for proc %s",
                                OPAL_NAME_PRINT(OPAL_PROC_MY_NAME), kp->key,
                                (OPAL_STRING == kp->type) ? kp->data.string : "NS",
                                OPAL_NAME_PRINT(*id));
            if (OPAL_SUCCESS != (ret = opal_dstore.store(opal_dstore_internal, id, kp))) {
                OPAL_ERROR_LOG(ret);
            }
            if (0 == strcmp(key, kp->key)) {
                *kv = kp;
                found = true;
            } else {
                OBJ_RELEASE(kp);
            }
        }
        if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
            OPAL_ERROR_LOG(rc);
        }
        OBJ_RELEASE(bptr);
        cnt = 1;
    }
    if (OPAL_ERR_UNPACK_READ_PAST_END_OF_BUFFER != rc) {
        OPAL_ERROR_LOG(rc);
    } else {
        rc = OPAL_SUCCESS;
    }
    OBJ_RELEASE(cb);

    opal_output_verbose(2, opal_pmix_base_framework.framework_output,
                        "%s pmix:native get completed",
                        OPAL_NAME_PRINT(OPAL_PROC_MY_NAME));
    if (found) {
        return OPAL_SUCCESS;
    }
    /* we didn't find the requested data - pass back a
     * status that indicates the source of the problem,
     * either during the data fetch, message unpacking,
     * or not found */
    *kv = NULL;
    if (OPAL_SUCCESS == rc) {
        if (OPAL_SUCCESS == ret) {
            rc = OPAL_ERR_NOT_FOUND;
        } else {
            rc = ret;
        }
    }
    return rc;
}
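native_get() above follows a cache-aside pattern: try the local dstore first, otherwise fetch the entire blob for the process in one round trip, cache every value it contains, and hand back only the requested key. The generic shape is sketched below, with hypothetical cache_lookup/cache_store/server_fetch_all helpers standing in for opal_dstore and the PMIx server exchange:

#include <stddef.h>
#include <string.h>

typedef struct { const char *key; int value; } entry_t;

/* Hypothetical stand-ins for the dstore and the server round trip. */
extern const entry_t *cache_lookup(const char *key);
extern void cache_store(const entry_t *e);
extern entry_t *server_fetch_all(size_t *n);   /* whole blob for the proc */

static const entry_t *get_sketch(const char *key)
{
    const entry_t *hit = cache_lookup(key);
    if (NULL != hit) {
        return hit;                        /* fast path: already local */
    }
    size_t n, i;
    entry_t *blob = server_fetch_all(&n);  /* one round trip */
    const entry_t *found = NULL;
    for (i = 0; i < n; i++) {
        cache_store(&blob[i]);             /* cache everything we got */
        if (0 == strcmp(key, blob[i].key)) {
            found = &blob[i];              /* keep the one requested */
        }
    }
    return found;                          /* NULL => not found */
}

Fetching the whole blob at once trades a larger first response for eliminating later round trips for that process's other keys.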
Code example #30
File: sensor_resusage.c  Project: forzaclaudio/orcm
static void sample(orcm_sensor_sampler_t *sampler)
{
    opal_pstats_t *stats;
    opal_node_stats_t *nstats;
    int rc, i;
    orte_proc_t *child;
    opal_buffer_t buf, *bptr;
    char *comp;

    OPAL_OUTPUT_VERBOSE((1, orcm_sensor_base_framework.framework_output,
                         "sample:resusage sampling resource usage"));
    
    /* setup a buffer for our stats */
    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* pack our name */
    comp = strdup("resusage");
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &comp, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    free(comp);

    /* update stats on ourself and the node */
    stats = OBJ_NEW(opal_pstats_t);
    nstats = OBJ_NEW(opal_node_stats_t);
    if (ORCM_SUCCESS != (rc = opal_pstat.query(orte_process_info.pid, stats, nstats))) {
        ORTE_ERROR_LOG(rc);
        OBJ_RELEASE(stats);   /* heap object - release rather than destruct */
        OBJ_RELEASE(nstats);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* the stats framework can't know nodename or rank */
    strncpy(stats->node, orte_process_info.nodename, (OPAL_PSTAT_MAX_STRING_LEN - 1));
    stats->rank = ORTE_PROC_MY_NAME->vpid;
#if 0
    /* locally save the stats */
    if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&my_proc->stats, stats))) {
        OBJ_RELEASE(st);
    }
    if (NULL != (nst = (opal_node_stats_t*)opal_ring_buffer_push(&my_node->stats, nstats))) {
        /* release the popped value */
        OBJ_RELEASE(nst);
    }
#endif

    /* pack them */
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &orte_process_info.nodename, 1, OPAL_STRING))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &nstats, 1, OPAL_NODE_STAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }
    if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
        ORTE_ERROR_LOG(rc);
        OBJ_DESTRUCT(&buf);
        return;
    }

    /* loop through our children and update their stats */
    if (NULL != orte_local_children) {
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_FLAG_ALIVE)) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            stats = OBJ_NEW(opal_pstats_t);
            if (ORCM_SUCCESS != opal_pstat.query(child->pid, stats, NULL)) {
                /* may hit a race condition where the process has
                 * terminated, so just ignore any error
                 */
                OBJ_RELEASE(stats);
                continue;
            }
            /* the stats framework can't know nodename or rank */
            strncpy(stats->node, orte_process_info.nodename, (OPAL_PSTAT_MAX_STRING_LEN - 1));
            stats->rank = child->name.vpid;
#if 0
            /* store it */
            if (NULL != (st = (opal_pstats_t*)opal_ring_buffer_push(&child->stats, stats))) {
                OBJ_RELEASE(st);
            }
#endif
            /* pack them */
            if (OPAL_SUCCESS != (rc = opal_dss.pack(&buf, &stats, 1, OPAL_PSTAT))) {
                ORTE_ERROR_LOG(rc);
                OBJ_DESTRUCT(&buf);
                return;
            }
        }
    }

    /* xfer any data for transmission */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        if (OPAL_SUCCESS != (rc = opal_dss.pack(&sampler->bucket, &bptr, 1, OPAL_BUFFER))) {
            ORTE_ERROR_LOG(rc);
            OBJ_DESTRUCT(&buf);
            return;
        }
    }
    OBJ_DESTRUCT(&buf);

#if 0
    /* are there any issues with node-level usage? */
    nst = (opal_node_stats_t*)opal_ring_buffer_poke(&my_node->stats, -1);
    if (NULL != nst && 0.0 < mca_sensor_resusage_component.node_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s CHECKING NODE MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* compute the percentage of node memory in-use */
        in_use = 1.0 - (nst->free_mem / nst->total_mem);
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s PERCENT USED: %f LIMIT: %f",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                             in_use, mca_sensor_resusage_component.node_memory_limit));
        if (mca_sensor_resusage_component.node_memory_limit <= in_use) {
            /* loop through our children and find the biggest hog */
            hog = NULL;
            max_mem = 0.0;
            for (i=0; i < orte_local_children->size; i++) {
                if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                    continue;
                }
                if (!ORTE_FLAG_TEST(child, ORTE_PROC_IS_ALIVE)) {
                    continue;
                }
                if (0 == child->pid) {
                    /* race condition */
                    continue;
                }
                if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                    continue;
                }
                OPAL_OUTPUT_VERBOSE((5, orcm_sensor_base_framework.framework_output,
                                     "%s PROC %s AT VSIZE %f",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&child->name), st->vsize));
                if (max_mem < st->vsize) {
                    hog = child;
                    max_mem = st->vsize;
                }
            }
            if (NULL == hog) {
                /* if all children dead and we are still too big,
                 * then we must be the culprit - abort
                 */
                OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                                     "%s NO CHILD: COMMITTING SUICIDE",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
                orte_errmgr.abort(ORCM_ERR_MEM_LIMIT_EXCEEDED, NULL);
            } else {
                /* report the problem */
                OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                                     "%s REPORTING %s TO ERRMGR FOR EXCEEDING LIMITS",
                                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                     ORTE_NAME_PRINT(&hog->name)));
                ORTE_ACTIVATE_PROC_STATE(&hog->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
            /* since we have ordered someone to die, we've done enough for this
             * time around - don't check proc limits as well
             */
            return;
        }
    }

    /* check proc limits */
    if (0.0 < mca_sensor_resusage_component.proc_memory_limit) {
        OPAL_OUTPUT_VERBOSE((2, orcm_sensor_base_framework.framework_output,
                             "%s CHECKING PROC MEM",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
        /* check my children first */
        for (i=0; i < orte_local_children->size; i++) {
            if (NULL == (child = (orte_proc_t*)opal_pointer_array_get_item(orte_local_children, i))) {
                continue;
            }
            if (!ORTE_FLAG_TEST(child, ORTE_PROC_IS_ALIVE)) {
                continue;
            }
            if (0 == child->pid) {
                /* race condition */
                continue;
            }
            if (NULL == (st = (opal_pstats_t*)opal_ring_buffer_poke(&child->stats, -1))) {
                continue;
            }
            OPAL_OUTPUT_VERBOSE((5, orcm_sensor_base_framework.framework_output,
                                 "%s PROC %s AT VSIZE %f",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                                 ORTE_NAME_PRINT(&child->name), st->vsize));
            if (mca_sensor_resusage_component.proc_memory_limit <= st->vsize) {
                /* report the problem */
                ORTE_ACTIVATE_PROC_STATE(&child->name, ORTE_PROC_STATE_SENSOR_BOUND_EXCEEDED);
            }
        }
    }
#endif
}
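The transmission step at the end of sample() is a pack-then-nest idiom: each sensor assembles its payload in a scratch buffer, then packs that buffer as a single OPAL_BUFFER entry into the sampler's bucket, keeping every sensor's contribution self-delimiting. A condensed sketch under the same opal_dss/OBJ conventions (xfer_to_bucket is hypothetical; return codes are ignored for brevity):

/* Condensed sketch of the pack-then-nest idiom from sample() above. */
static void xfer_to_bucket(orcm_sensor_sampler_t *sampler, char *name)
{
    opal_buffer_t buf, *bptr;

    OBJ_CONSTRUCT(&buf, opal_buffer_t);
    /* identify the sensor, then append its payload items */
    opal_dss.pack(&buf, &name, 1, OPAL_STRING);
    /* ... opal_dss.pack() each stats object here ... */
    if (0 < buf.bytes_used) {
        bptr = &buf;
        /* nest the whole scratch buffer as one OPAL_BUFFER entry */
        opal_dss.pack(&sampler->bucket, &bptr, 1, OPAL_BUFFER);
    }
    OBJ_DESTRUCT(&buf);
}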