Beispiel #1
0
static int init(void)
{
    char cwd[OPAL_PATH_MAX];
    int rc;

    OPAL_OUTPUT_VERBOSE((5, orcm_pvsn_base_framework.framework_output,
                         "%s pvsn:wwulf:init",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    
    if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, OPAL_PATH_MAX))) {
        return rc;
    }

    /* check to see if we can execute wwsh */
    cmd = opal_path_findv("wwsh", X_OK, environ, cwd);
    if (NULL == cmd) {
        return ORTE_ERR_EXE_NOT_FOUND;
    }

    OPAL_OUTPUT_VERBOSE((5, orcm_pvsn_base_framework.framework_output,
                         "%s pvsn:wwulf:init path to wwsh %s",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), cmd));

    return ORCM_SUCCESS;
}
Beispiel #2
0
static void check_debugger(int fd, short event, void *arg)
{
    struct timeval now;
    opal_event_t *tmp = (opal_event_t*)arg;
    orte_job_t *jdata;
    orte_app_context_t *app;
    char cwd[OPAL_PATH_MAX];
    int rc;
    int32_t ljob;

    if (MPIR_being_debugged) {
        if (orte_debug_flag) {
            opal_output(0, "%s Launching debugger %s", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
            MPIR_executable_path);
        }
        
        /* a debugger has attached! All the MPIR_Proctable
         * data is already available, so we only need to
         * check to see if we should spawn any daemons
         */
        if ('\0' != MPIR_executable_path[0]) {
            /* this will be launched just like a regular job,
             * so we do not use the global orte_debugger_daemon
             * as this is reserved for co-location upon startup
             */
            jdata = OBJ_NEW(orte_job_t);
            /* create a jobid for these daemons - this is done solely
             * to avoid confusing the rest of the system's bookkeeping
             */
            orte_plm_base_create_jobid(jdata);
            /* flag the job as being debugger daemons */
            jdata->controls |= ORTE_JOB_CONTROL_DEBUGGER_DAEMON;
            /* unless directed, we do not forward output */
            if (!MPIR_forward_output) {
                jdata->controls &= ~ORTE_JOB_CONTROL_FORWARD_OUTPUT;
            }
            /* set the mapping policy to "pernode" so we only get
             * one debugger daemon on each node
             */
            jdata->map = OBJ_NEW(orte_job_map_t);
            jdata->map->npernode = 1;
            /* add it to the global job pool */
            ljob = ORTE_LOCAL_JOBID(jdata->jobid);
            opal_pointer_array_set_item(orte_job_data, ljob, jdata);
            /* create an app_context for the debugger daemon */
            app = OBJ_NEW(orte_app_context_t);
            app->app = strdup((char*)MPIR_executable_path);
            if (OPAL_SUCCESS != (rc = opal_getcwd(cwd, sizeof(cwd)))) {
                orte_show_help("help-orterun.txt", "orterun:init-failure",
                               true, "get the cwd", rc);
                OBJ_RELEASE(jdata);
                goto RELEASE;
            }
            app->cwd = strdup(cwd);
            app->user_specified_cwd = false;
            opal_argv_append_nosize(&app->argv, app->app);
            build_debugger_args(app);
            opal_pointer_array_add(jdata->apps, &app->super);
            jdata->num_apps = 1;
            /* now go ahead and spawn this job */
            if (ORTE_SUCCESS != (rc = orte_plm.spawn(jdata))) {
                ORTE_ERROR_LOG(rc);
            }
        }
        
    RELEASE:
        /* notify the debugger that all is ready */
        MPIR_Breakpoint();

    } else {
        /* reissue the timer to wake us up again */
        now.tv_sec = orte_debugger_check_rate;
        now.tv_usec = 0;
        opal_evtimer_add(tmp, &now);
    }
}
Beispiel #3
0
int main(int argc, char *argv[])
{
    int32_t ret, i;
    opal_cmd_line_t cmd_line;
    char **inpt;
    opal_buffer_t *buf;
    int count;
    char cwd[OPAL_PATH_MAX];
    orcm_tool_cmd_t flag = ORCM_TOOL_STOP_CMD;
    int32_t master=0;
    uint16_t jfam=0;

    /***************
     * Initialize
     ***************/
    
    /*
     * Make sure to init util before parse_args
     * to ensure installdirs is setup properly
     * before calling mca_base_open();
     */
    if( ORTE_SUCCESS != (ret = orcm_init_util()) ) {
        return ret;
    }
    
    /* initialize the globals */
    my_globals.help = false;
    my_globals.replicas = NULL;
    my_globals.sched = NULL;
    my_globals.hnp_uri = NULL;
    
    /* Parse the command line options */
    opal_cmd_line_create(&cmd_line, cmd_line_opts);
    
    mca_base_open();
    mca_base_cmd_line_setup(&cmd_line);
    ret = opal_cmd_line_parse(&cmd_line, true, argc, argv);
    
    /* extract the MCA/GMCA params */
    mca_base_cmd_line_process_args(&cmd_line, &environ, &environ);

    /**
     * Now start parsing our specific arguments
     */
    if (OPAL_SUCCESS != ret || my_globals.help) {
        char *args = NULL;
        args = opal_cmd_line_get_usage_msg(&cmd_line);
        orte_show_help("help-orcm-stop.txt", "usage", true, args);
        free(args);
        return ORTE_ERROR;
    }
    
    if (NULL != my_globals.sched) {
        if (0 == strncmp(my_globals.sched, "file", strlen("file")) ||
            0 == strncmp(my_globals.sched, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;
        
            /* it is a file - get the filename */
            filename = strchr(my_globals.sched, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "scheduler", my_globals.sched);
                return ORTE_ERROR;
            }
            ++filename; /* space past the : */
        
            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "scheduler", my_globals.sched);
                return ORTE_ERROR;
            }
        
            /* open the file and extract the pid */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-access", true, "scheduler", filename);
                return ORTE_ERROR;
            }
            if (NULL == fgets(input, 1024, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-openrcm-runtime.txt", "hnp-file-bad", "scheduler", true, filename);
                return ORTE_ERROR;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            /* convert the pid */
            master = strtoul(input, NULL, 10);
        } else {
            /* should just be the master itself */
            master = strtoul(my_globals.sched, NULL, 10);
        }
    }

    /* if we were given HNP contact info, parse it and
     * setup the process_info struct with that info
     */
    if (NULL != my_globals.hnp_uri) {
        if (0 == strncmp(my_globals.hnp_uri, "file", strlen("file")) ||
            0 == strncmp(my_globals.hnp_uri, "FILE", strlen("FILE"))) {
            char input[1024], *filename;
            FILE *fp;
            
            /* it is a file - get the filename */
            filename = strchr(my_globals.hnp_uri, ':');
            if (NULL == filename) {
                /* filename is not correctly formatted */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "uri", my_globals.hnp_uri);
                goto cleanup;
            }
            ++filename; /* space past the : */
            
            if (0 >= strlen(filename)) {
                /* they forgot to give us the name! */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-bad", true, "uri", my_globals.hnp_uri);
                goto cleanup;
            }
            
            /* open the file and extract the uri */
            fp = fopen(filename, "r");
            if (NULL == fp) { /* can't find or read file! */
                orte_show_help("help-openrcm-runtime.txt", "hnp-filename-access", true, filename);
                goto cleanup;
            }
            if (NULL == fgets(input, 1024, fp)) {
                /* something malformed about file */
                fclose(fp);
                orte_show_help("help-openrcm-runtime.txt", "hnp-file-bad", true, filename);
                goto cleanup;
            }
            fclose(fp);
            input[strlen(input)-1] = '\0';  /* remove newline */
            /* put into the process info struct */
            orte_process_info.my_hnp_uri = strdup(input);
        } else {
            /* should just be the uri itself */
            orte_process_info.my_hnp_uri = strdup(my_globals.hnp_uri);
        }
    }
    
    if (OPAL_SUCCESS != opal_getcwd(cwd, sizeof(cwd))) {
        opal_output(orte_clean_output, "failed to get cwd\n");
        return ORTE_ERR_NOT_FOUND;
    }
    
    /***************************
     * We need all of OPAL and ORTE - this will
     * automatically connect us to the CM
     ***************************/
    if (ORTE_SUCCESS != orcm_init(ORCM_TOOL)) {
        orcm_finalize();
        return 1;
    }
    
    /* if we were given the hnp uri, extract the job family for the
     * master id
     */
    if (NULL != my_globals.hnp_uri) {
        master = ORTE_JOB_FAMILY(ORTE_PROC_MY_HNP->jobid);
    }
    
    /* register to receive responses */
    if (ORCM_SUCCESS != (ret = orcm_pnp.register_receive("orcm-stop", "0.1", "alpha",
                                                         ORCM_PNP_GROUP_INPUT_CHANNEL,
                                                         ORCM_PNP_TAG_TOOL,
                                                         ack_recv, NULL))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    
    /* announce my existence */
    if (ORCM_SUCCESS != (ret = orcm_pnp.announce("orcm-stop", "0.1", "alpha", NULL))) {
        ORTE_ERROR_LOG(ret);
        goto cleanup;
    }
    
    /* setup the buffer to send our cmd */
    buf = OBJ_NEW(opal_buffer_t);
    
    /* indicate the scheduler to be used */
    jfam = master & 0x0000ffff;
    opal_dss.pack(buf, &jfam, 1, OPAL_UINT16);
    
    /* get the apps to stop */
    inpt = NULL;
    opal_cmd_line_get_tail(&cmd_line, &count, &inpt);
    
    if (0 == count) {
        /* if no apps were given, then we stop the entire
         * DVM itself by telling the daemon's to terminate
         */
        if (ORCM_SUCCESS != (ret = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                      NULL, ORCM_PNP_TAG_TERMINATE,
                                                      NULL, 0, buf, cbfunc, NULL))) {
            ORTE_ERROR_LOG(ret);
        }
        goto cleanup;
    } else {
        /* load the stop cmd */
        opal_dss.pack(buf, &flag, 1, ORCM_TOOL_CMD_T);
    
        /* for each app */
        for (i=0; NULL != inpt[i]; i++) {
            opal_dss.pack(buf, &inpt[i], 1, OPAL_STRING);
            /* pack the replicas to be stopped */
            opal_dss.pack(buf, &my_globals.replicas, 1, OPAL_STRING);
        }
        opal_argv_free(inpt);
    
        if (ORCM_SUCCESS != (ret = orcm_pnp.output_nb(ORCM_PNP_SYS_CHANNEL,
                                                      NULL, ORCM_PNP_TAG_TOOL,
                                                      NULL, 0, buf, cbfunc, NULL))) {
            ORTE_ERROR_LOG(ret);
        }
    }

    /* now wait for ack */
    opal_event_dispatch(opal_event_base);
    
    /***************
     * Cleanup
     ***************/
 cleanup:
    orcm_finalize();
    
    return ret;
}