コード例 #1
0
/*
 * Parse the proxy's command-line arguments and validate the settings
 * they leave behind in HYD_pmcd_pmip.
 *
 * t_argv: NULL-terminated argument vector.  The first entry is skipped
 *         (presumably the program name); the rest is consumed by
 *         HYDU_parse_array() using HYD_pmcd_pmip_match_table.
 *
 * Besides parsing, this routine initializes the bootstrap server,
 * queries the launcher for a proxy ID when none was supplied, and
 * restarts the debug facility with a "proxy:<pgid>:<id>" prefix.
 *
 * Returns HYD_SUCCESS when everything checks out; otherwise the status
 * of the failing call, or HYD_INTERNAL_ERROR for a missing mandatory
 * setting.
 */
HYD_status HYD_pmcd_pmip_get_params(char **t_argv)
{
    /* Kept in static storage, as in the original; HYDU_dbg_init() may
     * hold on to the pointer after we return (assumption -- verify). */
    static char prefix_buf[2 * MAX_HOSTNAME_LEN];
    char **cursor = t_argv + 1;
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Consume proxy arguments until the NULL terminator is reached.
     * The parser is always invoked at least once. */
    do {
        status = HYDU_parse_array(&cursor, HYD_pmcd_pmip_match_table);
        HYDU_ERR_POP(status, "error parsing input array\n");
    } while (*cursor != NULL);

    /* Mandatory settings: bail out if any of them is still unset */
    if (!HYD_pmcd_pmip.upstream.server_name) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "server name not available\n");
    }

    if (HYD_pmcd_pmip.upstream.server_port == -1) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "server port not available\n");
    }

    if (!HYD_pmcd_pmip.user_global.demux) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "demux engine not available\n");
    }

    /* Debugging defaults to off; this must be settled before the
     * bootstrap server is initialized since the flag is passed to it. */
    if (HYD_pmcd_pmip.user_global.debug == -1) {
        HYD_pmcd_pmip.user_global.debug = 0;
    }

    status = HYDT_bsci_init(HYD_pmcd_pmip.user_global.rmk,
                            HYD_pmcd_pmip.user_global.launcher,
                            HYD_pmcd_pmip.user_global.launcher_exec,
                            0 /* disable x */ ,
                            HYD_pmcd_pmip.user_global.debug);
    HYDU_ERR_POP(status, "proxy unable to initialize bootstrap server\n");

    if (HYD_pmcd_pmip.local.id == -1) {
        /* No proxy ID was passed at launch time, so ask the launcher
         * for one. */
        status = HYDT_bsci_query_proxy_id(&HYD_pmcd_pmip.local.id);
        HYDU_ERR_POP(status, "unable to query launcher for proxy ID\n");

        /* The launcher could not provide one either */
        if (HYD_pmcd_pmip.local.id == -1) {
            HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "proxy ID not available\n");
        }
    }

    if (HYD_pmcd_pmip.local.pgid == -1) {
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "PG ID not available\n");
    }

    /* Retry count defaults to zero */
    if (HYD_pmcd_pmip.local.retries == -1) {
        HYD_pmcd_pmip.local.retries = 0;
    }

    /* Restart the debug facility so its prefix carries the process
     * group ID and the proxy ID. */
    HYDU_dbg_finalize();
    MPL_snprintf(prefix_buf, sizeof(prefix_buf), "proxy:%d:%d",
                 HYD_pmcd_pmip.local.pgid, HYD_pmcd_pmip.local.id);
    status = HYDU_dbg_init(prefix_buf);
    HYDU_ERR_POP(status, "unable to initialization debugging\n");

  fn_exit:
    HYDU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
コード例 #2
0
ファイル: mpiexec.c プロジェクト: adk9/hydra
/*
 * mpiexec entry point.
 *
 * Sequence: bring up the debug, signal and FTB layers; parse the user
 * parameters; initialize the demux engine and the bootstrap server;
 * build the node, executable and proxy lists; launch the processes and
 * wait for them; then fold the per-process exit statuses into the
 * value returned to the caller.
 *
 * Return value:
 *   0                      if the final status is HYD_GRACEFUL_ABORT
 *   -1                     if the final status is any other failure
 *   otherwise, based on the bitwise OR of all collected exit statuses:
 *     the raw value        if it decodes as "terminated by a signal"
 *                          (a diagnostic is printed first)
 *     WEXITSTATUS(value)   if it decodes as a normal exit
 *     WSTOPSIG(value)      if it decodes as "stopped"
 *     the raw value        in any other case
 */
int main(int argc, char **argv)
{
    struct HYD_proxy *proxy;
    struct HYD_exec *exec;
    struct HYD_node *node;
    int exit_status = 0, i, reset_rmk, global_core_count;
    int timeout = -1;           /* -1 means "wait forever" */
    HYD_status status = HYD_SUCCESS;

    HYDU_FUNC_ENTER();

    /* Initialize engines that don't require us to know anything
     * about the user preferences first */
    status = HYDU_dbg_init("mpiexec");
    HYDU_ERR_POP(status, "unable to initialization debugging\n");

    status = HYDU_set_common_signals(signal_cb);
    HYDU_ERR_POP(status, "unable to set signal\n");

    status = HYDT_ftb_init();
    HYDU_ERR_POP(status, "unable to initialize FTB\n");

    /* Get user preferences */
    status = HYD_uii_mpx_get_parameters(argv);
    HYDU_ERR_POP(status, "error parsing parameters\n");

    /* Now we initialize engines that require us to know user
     * preferences */
#if HAVE_ALARM
    if (HYD_ui_mpich_info.ckpoint_int != -1)
        alarm(HYD_ui_mpich_info.ckpoint_int);
#endif /* HAVE_ALARM */

    /* The demux engine should be initialized before any sockets are
     * created, since it checks for STDIN's validity.  If STDIN was
     * closed and we opened a socket that got the same fd as STDIN,
     * this test will not be possible. */
    status = HYDT_dmx_init(&HYD_server_info.user_global.demux);
    HYDU_ERR_POP(status, "unable to initialize the demux engine\n");

    status =
        HYDT_bsci_init(HYD_server_info.user_global.rmk, HYD_server_info.user_global.launcher,
                       HYD_server_info.user_global.launcher_exec,
                       HYD_server_info.user_global.enablex, HYD_server_info.user_global.debug);
    HYDU_ERR_POP(status, "unable to initialize the bootstrap server\n");

    /* Set whenever we end up with a node list that differs from what
     * the RMK handed out; triggers a bootstrap re-init further down. */
    reset_rmk = 0;

    if (HYD_server_info.node_list == NULL) {
        /* Node list is not created yet. The user might not have
         * provided the host file. Query the RMK. */
        status = HYDT_bsci_query_node_list(&HYD_server_info.node_list);
        HYDU_ERR_POP(status, "unable to query the RMK for a node list\n");

        if (HYD_server_info.node_list == NULL) {
            char localhost[MAX_HOSTNAME_LEN] = { 0 };

            /* The RMK didn't give us anything back; use localhost */
            status = HYDU_gethostname(localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");

            status = HYDU_add_to_node_list(localhost, 1, &HYD_server_info.node_list);
            HYDU_ERR_POP(status, "unable to add to node list\n");

            reset_rmk = 1;
        }
    }

    /*
     * If this is a checkpoint-restart, if the user specified the
     * number of processes, we already have a dummy executable. If the
     * number of processes came from the RMK, our executable list is
     * still NULL; a dummy executable needs to be created.
     */
    if (HYD_uii_mpx_exec_list == NULL) {
        HYDU_ASSERT(HYD_server_info.user_global.ckpoint_prefix, status);

        /* create a dummy executable */
        status = HYDU_alloc_exec(&HYD_uii_mpx_exec_list);
        HYDU_ERR_POP(status, "unable to allocate exec\n");
        HYD_uii_mpx_exec_list->appnum = 0;
    }

    if (HYD_server_info.user_global.debug)
        for (node = HYD_server_info.node_list; node; node = node->next)
            HYDU_dump_noprefix(stdout, "host: %s\n", node->hostname);

    /* Reset the host list to use only the number of processes per
     * node as specified by the ppn option. */
    if (HYD_ui_mpich_info.ppn != -1) {
        for (node = HYD_server_info.node_list; node; node = node->next)
            node->core_count = HYD_ui_mpich_info.ppn;
        reset_rmk = 1;
    }

    /* The RMK returned a node list. See if the user requested us to
     * manipulate it in some way */
    if (HYD_ui_mpich_info.sort_order != NONE) {
        qsort_node_list();
        reset_rmk = 1;
    }

    if (reset_rmk) {
        /* Reassign node IDs to each node */
        for (node = HYD_server_info.node_list, i = 0; node; node = node->next, i++)
            node->node_id = i;

        /* Reinitialize the bootstrap server with the "user" RMK, so
         * it knows that we are not using the node list provided by
         * the RMK */
        status = HYDT_bsci_finalize();
        HYDU_ERR_POP(status, "unable to finalize bootstrap device\n");

        status = HYDT_bsci_init("user", HYDT_bsci_info.launcher, HYDT_bsci_info.launcher_exec,
                                HYDT_bsci_info.enablex, HYDT_bsci_info.debug);
        HYDU_ERR_POP(status, "unable to reinitialize the bootstrap server\n");
    }

    /* If the number of processes is not given, we allocate all the
     * available nodes to each executable */
    HYD_server_info.pg_list.pg_process_count = 0;
    for (exec = HYD_uii_mpx_exec_list; exec; exec = exec->next) {
        if (exec->proc_count == -1) {
            global_core_count = 0;
            for (node = HYD_server_info.node_list; node; node = node->next)
                global_core_count += node->core_count;
            exec->proc_count = global_core_count;
        }
        HYD_server_info.pg_list.pg_process_count += exec->proc_count;
    }

    status = HYDU_list_inherited_env(&HYD_server_info.user_global.global_env.inherited);
    HYDU_ERR_POP(status, "unable to get the inherited env list\n");

    status = HYDU_create_proxy_list(HYD_uii_mpx_exec_list, HYD_server_info.node_list,
                                    &HYD_server_info.pg_list);
    HYDU_ERR_POP(status, "unable to create proxy list\n");

    /* calculate the core count used by the PG */
    HYD_server_info.pg_list.pg_core_count = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next)
        HYD_server_info.pg_list.pg_core_count += proxy->node->core_count;

    /* If the user didn't specify a local hostname, try to find one in
     * the list of nodes passed to us */
    if (HYD_server_info.localhost == NULL) {
        /* See if the node list contains a remotely accessible localhost */
        for (node = HYD_server_info.node_list; node; node = node->next) {
            int is_local, remote_access;

            status = HYDU_sock_is_local(node->hostname, &is_local);
            HYDU_ERR_POP(status, "unable to check if %s is local\n", node->hostname);

            if (is_local) {
                status = HYDU_sock_remote_access(node->hostname, &remote_access);
                HYDU_ERR_POP(status, "unable to check if %s is remotely accessible\n",
                             node->hostname);

                if (remote_access)
                    break;
            }
        }

        /* node is non-NULL only if the loop above stopped early on a
         * local, remotely accessible entry */
        if (node)
            HYD_server_info.localhost = HYDU_strdup(node->hostname);
        else {
            HYDU_MALLOC(HYD_server_info.localhost, char *, MAX_HOSTNAME_LEN, status);
            status = HYDU_gethostname(HYD_server_info.localhost);
            HYDU_ERR_POP(status, "unable to get local hostname\n");
        }
    }

    if (HYD_server_info.user_global.debug)
        HYD_uiu_print_params();

    /* Only a positive return is taken to mean that timeout was filled
     * in.  A zero return (variable not set) and a negative return
     * (presumably "set but not a valid integer", in which case the
     * output is assumed to be left untouched -- TODO confirm against
     * MPL) both fall back to an infinite timeout, so an uninitialized
     * value is never printed or passed down. */
    if (MPL_env2int("MPIEXEC_TIMEOUT", &timeout) <= 0)
        timeout = -1;   /* Infinite timeout */

    if (HYD_server_info.user_global.debug)
        HYDU_dump(stdout, "Timeout set to %d (-1 means infinite)\n", timeout);

    /* Check if the user wants us to use a port within a certain
     * range.  The first variable found wins; the value is duplicated
     * so that port_range holds our own copy. */
    if (MPL_env2str("MPIEXEC_PORTRANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPIEXEC_PORT_RANGE", (const char **) &HYD_server_info.port_range) ||
        MPL_env2str("MPICH_PORT_RANGE", (const char **) &HYD_server_info.port_range))
        HYD_server_info.port_range = HYDU_strdup(HYD_server_info.port_range);

    /* Add the stdout/stderr callback handlers */
    HYD_server_info.stdout_cb = HYD_uiu_stdout_cb;
    HYD_server_info.stderr_cb = HYD_uiu_stderr_cb;

    /* Create a pipe connection to wake up the process manager */
    if (pipe(HYD_server_info.cmd_pipe) < 0)
        HYDU_ERR_SETANDJUMP(status, HYD_INTERNAL_ERROR, "pipe error\n");

    /* Launch the processes */
    status = HYD_pmci_launch_procs();
    HYDU_ERR_POP(status, "process manager returned error launching processes\n");

    /* Wait for their completion */
    status = HYD_pmci_wait_for_completion(timeout);
    HYDU_ERR_POP(status, "process manager error waiting for completion\n");

    /* Check for the exit status for all the processes; the individual
     * values are OR-ed together into exit_status */
    if (HYD_ui_mpich_info.print_all_exitcodes)
        HYDU_dump(stdout, "Exit codes: ");
    exit_status = 0;
    for (proxy = HYD_server_info.pg_list.proxy_list; proxy; proxy = proxy->next) {
        if (proxy->exit_status == NULL) {
            /* We didn't receive the exit status for this proxy */
            continue;
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "[%s] ", proxy->node->hostname);

        for (i = 0; i < proxy->proxy_process_count; i++) {
            if (HYD_ui_mpich_info.print_all_exitcodes) {
                HYDU_dump_noprefix(stdout, "%d", proxy->exit_status[i]);
                /* comma-separate all but the last entry */
                if (i < proxy->proxy_process_count - 1)
                    HYDU_dump_noprefix(stdout, ",");
            }

            exit_status |= proxy->exit_status[i];
        }

        if (HYD_ui_mpich_info.print_all_exitcodes)
            HYDU_dump_noprefix(stdout, "\n");
    }

    /* Call finalize functions for lower layers to cleanup their resources */
    status = HYD_pmci_finalize();
    HYDU_ERR_POP(status, "process manager error on finalize\n");

    status = HYDT_ftb_finalize();
    HYDU_ERR_POP(status, "error finalizing FTB\n");

#if defined ENABLE_PROFILING
    if (HYD_server_info.enable_profiling) {
        HYDU_dump_noprefix(stdout, "\n");
        HYD_DRAW_LINE(80);
        HYDU_dump(stdout, "Number of PMI calls seen by the server: %d\n",
                  HYD_server_info.num_pmi_calls);
        HYD_DRAW_LINE(80);
        HYDU_dump_noprefix(stdout, "\n");
    }
#endif /* ENABLE_PROFILING */

    /* Free the mpiexec params */
    HYD_uiu_free_params();
    HYDU_free_exec_list(HYD_uii_mpx_exec_list);

  fn_exit:
    HYDU_dbg_finalize();
    HYDU_FUNC_EXIT();
    /* Translate the internal status / aggregated wait status into the
     * value returned from main; see the function header for the
     * mapping. */
    if (status == HYD_GRACEFUL_ABORT)
        return 0;
    else if (status != HYD_SUCCESS)
        return -1;
    else if (WIFSIGNALED(exit_status)) {
        printf("YOUR APPLICATION TERMINATED WITH THE EXIT STRING: %s (signal %d)\n",
               strsignal(WTERMSIG(exit_status)), WTERMSIG(exit_status));
        printf("This typically refers to a problem with your application.\n");
        printf("Please see the FAQ page for debugging suggestions\n");
        return exit_status;
    }
    else if (WIFEXITED(exit_status)) {
        return WEXITSTATUS(exit_status);
    }
    else if (WIFSTOPPED(exit_status)) {
        return WSTOPSIG(exit_status);
    }
    else {
        return exit_status;
    }

  fn_fail:
    goto fn_exit;
}