示例#1
0
static int test_spawn_common(char *my_nspace, int my_rank, int blocking)
{
    int rc;
    pmix_app_t *apps;
    size_t napps;
    char nspace[PMIX_MAX_NSLEN+1];
    memset(nspace, 0, PMIX_MAX_NSLEN+1);
    napps = 1;
    PMIX_APP_CREATE(apps, napps);
    if (blocking) {
        if (PMIX_SUCCESS != (rc = PMIx_Spawn(NULL, 0, apps, napps, nspace))) {
            PMIX_APP_FREE(apps, napps);
            return rc;
        }
    } else {
        spawn_cbdata cbdata;
        cbdata.in_progress = 1;
        memset(cbdata.nspace, 0, PMIX_MAX_NSLEN);
        rc = PMIx_Spawn_nb(NULL, 0, apps, napps, spawn_cb, (void*)&cbdata);
        if (PMIX_SUCCESS != rc) {
            PMIX_APP_FREE(apps, napps);
            return rc;
        }
        PMIX_WAIT_FOR_COMPLETION(cbdata.in_progress);
        strncpy(nspace, cbdata.nspace, strlen(cbdata.nspace)+1);
    }
    PMIX_APP_FREE(apps, napps);
    if (strncmp(nspace, "foobar", strlen(nspace)+1)) {
        return PMIX_ERROR;
    }
    return rc;
}
示例#2
0
文件: pmi2.c 项目: AT95/ompi
int PMI2_Job_Spawn(int count, const char * cmds[],
                   int argcs[], const char ** argvs[],
                   const int maxprocs[],
                   const int info_keyval_sizes[],
                   const PMI_keyval_t *info_keyval_vectors[],
                   int preput_keyval_size,
                   const PMI_keyval_t *preput_keyval_vector[],
                   char jobId[], int jobIdSize,
                   int errors[])
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_app_t *apps;
    int i, k;
    size_t j;
    char *evar;

    PMI2_CHECK();

    if (NULL == cmds) {
        return PMI2_ERR_INVALID_ARGS;
    }

    /* setup the apps */
    PMIX_APP_CREATE(apps, count);
    for (i=0; i < count; i++) {
        apps[i].cmd = strdup(cmds[i]);
        apps[i].maxprocs = maxprocs[i];
        apps[i].argv = pmix_argv_copy((char**)argvs[i]);
        apps[i].argc = pmix_argv_count(apps[i].argv);
        apps[i].ninfo = info_keyval_sizes[i];
        apps[i].info = (pmix_info_t*)malloc(apps[i].ninfo * sizeof(pmix_info_t));
        /* copy the info objects */
        for (j=0; j < apps[i].ninfo; j++) {
            (void)strncpy(apps[i].info[j].key, info_keyval_vectors[i][j].key, PMIX_MAX_KEYLEN);
            apps[i].info[j].value.type = PMIX_STRING;
            apps[i].info[j].value.data.string = strdup(info_keyval_vectors[i][j].val);
        }
        /* push the preput values into the apps environ */
        for (k=0; k < preput_keyval_size; k++) {
            (void)asprintf(&evar, "%s=%s", preput_keyval_vector[j]->key, preput_keyval_vector[j]->val);
            pmix_argv_append_nosize(&apps[i].env, evar);
            free(evar);
        }
    }

    rc = PMIx_Spawn(NULL, 0, apps, count, NULL);
    /* tear down the apps array */
    for (i=0; i < count; i++) {
        PMIX_APP_DESTRUCT(&apps[i]);
    }
    free(apps);
    if (NULL != errors) {
        for (i=0; i < count; i++) {
            errors[i] = convert_err(rc);
        }
    }

    return convert_err(rc);
}
示例#3
0
static pmix_status_t spawn_debugger(char *appspace, myrel_t *myrel)
{
    pmix_status_t rc;
    pmix_info_t *dinfo;
    pmix_app_t *debugger;
    size_t dninfo;
    char cwd[1024];
    char dspace[PMIX_MAX_NSLEN+1];
    mylock_t mylock;
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;

    /* setup the debugger */
    PMIX_APP_CREATE(debugger, 1);
    debugger[0].cmd = strdup("./debuggerd");
    PMIX_ARGV_APPEND(rc, debugger[0].argv, "./debuggerd");
    getcwd(cwd, 1024);  // point us to our current directory
    debugger[0].cwd = strdup(cwd);
    /* provide directives so the daemons go where we want, and
     * let the RM know these are debugger daemons */
    dninfo = 6;
    PMIX_INFO_CREATE(dinfo, dninfo);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
    PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUGGER_DAEMONS, NULL, PMIX_BOOL); // these are debugger daemons
    PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUG_JOB, appspace, PMIX_STRING); // the nspace being debugged
    PMIX_INFO_LOAD(&dinfo[2], PMIX_NOTIFY_COMPLETION, NULL, PMIX_BOOL); // notify us when the debugger job completes
    PMIX_INFO_LOAD(&dinfo[3], PMIX_DEBUG_WAITING_FOR_NOTIFY, NULL, PMIX_BOOL);  // tell the daemon that the proc is waiting to be released
    PMIX_INFO_LOAD(&dinfo[4], PMIX_FWD_STDOUT, NULL, PMIX_BOOL);  // forward stdout to me
    PMIX_INFO_LOAD(&dinfo[5], PMIX_FWD_STDERR, NULL, PMIX_BOOL);  // forward stderr to me
    /* spawn the daemons */
    fprintf(stderr, "Debugger: spawning %s\n", debugger[0].cmd);
    if (PMIX_SUCCESS != (rc = PMIx_Spawn(dinfo, dninfo, debugger, 1, dspace))) {
        fprintf(stderr, "Debugger daemons failed to launch with error: %s\n", PMIx_Error_string(rc));
        PMIX_INFO_FREE(dinfo, dninfo);
        PMIX_APP_FREE(debugger, 1);
        return rc;
    }
    /* cleanup */
    PMIX_INFO_FREE(dinfo, dninfo);
    PMIX_APP_FREE(debugger, 1);

    /* register callback for when this job terminates */
    myrel->nspace = strdup(dspace);
    PMIX_INFO_CREATE(dinfo, 2);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_EVENT_RETURN_OBJECT, myrel, PMIX_POINTER);
    /* only call me back when this specific job terminates */
    PMIX_INFO_LOAD(&dinfo[1], PMIX_NSPACE, dspace, PMIX_STRING);

    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(&code, 1, dinfo, 2,
                                release_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    PMIX_INFO_FREE(dinfo, 2);

    return rc;
}
示例#4
0
文件: simpdyn.c 项目: bosilca/ompi
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    pmix_proc_t proc;
    uint32_t nprocs;
    char nsp2[PMIX_MAX_NSLEN+1];
    pmix_app_t *app;
    char hostname[PMIX_MAXHOSTNAMELEN];
    pmix_proc_t *peers;
    size_t npeers, ntmp=0;
    char *nodelist;

    gethostname(hostname, sizeof(hostname));

    /* init us */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %d", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank);

    /* get our universe size */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs);

    /* call fence to sync */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* rank=0 calls spawn */
    if (0 == myproc.rank) {
        PMIX_APP_CREATE(app, 1);
        app->cmd = strdup("gumby");
        app->maxprocs = 2;
        pmix_argv_append_nosize(&app->argv, "gumby");
        pmix_argv_append_nosize(&app->argv, "-n");
        pmix_argv_append_nosize(&app->argv, "2");
        pmix_setenv("PMIX_ENV_VALUE", "3", true, &app->env);
        PMIX_INFO_CREATE(app->info, 2);
        (void)strncpy(app->info[0].key, "DARTH", PMIX_MAX_KEYLEN);
        app->info[0].value.type = PMIX_INT8;
        app->info[0].value.data.int8 = 12;
        (void)strncpy(app->info[1].key, "VADER", PMIX_MAX_KEYLEN);
        app->info[1].value.type = PMIX_DOUBLE;
        app->info[1].value.data.dval = 12.34;

        pmix_output(0, "Client ns %s rank %d: calling PMIx_Spawn", myproc.nspace, myproc.rank);
        if (PMIX_SUCCESS != (rc = PMIx_Spawn(NULL, 0, app, 1, nsp2))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Spawn failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        PMIX_APP_FREE(app, 1);

        /* check to see if we got the expected info back */
        if (0 != strncmp(nsp2, "DYNSPACE", PMIX_MAX_NSLEN)) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Spawn returned incorrect nspace: %s", myproc.nspace, myproc.rank, nsp2);
            goto done;
        } else {
            pmix_output(0, "Client ns %s rank %d: PMIx_Spawn succeeded returning nspace: %s", myproc.nspace, myproc.rank, nsp2);
        }
        /* get their universe size */
        (void)strncpy(proc.nspace, nsp2, PMIX_MAX_NSLEN);
        proc.rank = PMIX_RANK_WILDCARD;
        val = NULL;
        if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val)) ||
            NULL == val) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        ntmp = val->data.uint32;
        PMIX_VALUE_RELEASE(val);
        pmix_output(0, "Client %s:%d universe %s size %d", myproc.nspace, myproc.rank, nsp2, (int)ntmp);
    }

    /* just cycle the connect/disconnect functions */
    if (PMIX_SUCCESS != (rc = PMIx_Connect(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Connect failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    pmix_output(0, "Client ns %s rank %d: PMIx_Connect succeeded",
                myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Disconnect(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Disonnect failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    pmix_output(0, "Client ns %s rank %d: PMIx_Disconnect succeeded", myproc.nspace, myproc.rank);

    /* finally, test the resolve functions */
    if (0 == myproc.rank) {
        if (PMIX_SUCCESS != (rc = PMIx_Resolve_peers(hostname, NULL, &peers, &npeers))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers failed for nspace %s: %d", myproc.nspace, myproc.rank, nsp2, rc);
            goto done;
        }
        if ((nprocs+ntmp) != npeers) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers returned incorrect npeers: %d vs %d", myproc.nspace, myproc.rank, (int)(nprocs+ntmp), (int)npeers);
            goto done;
        }
        pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers returned %d npeers", myproc.nspace, myproc.rank, (int)npeers);
        if (PMIX_SUCCESS != (rc = PMIx_Resolve_nodes(nsp2, &nodelist))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_nodes failed for nspace %s: %d", myproc.nspace, myproc.rank, nsp2, rc);
            goto done;
        }
        pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_nodes %s", myproc.nspace, myproc.rank, nodelist);
    } else {
        if (PMIX_SUCCESS != (rc = PMIx_Resolve_peers(hostname, myproc.nspace, &peers, &npeers))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers failed for nspace %s: %d", myproc.nspace, myproc.rank, myproc.nspace, rc);
            goto done;
        }
        if (nprocs != npeers) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers returned incorrect npeers: %d vs %d", myproc.nspace, myproc.rank, nprocs, (int)npeers);
            goto done;
        }
        pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_peers returned %d npeers", myproc.nspace, myproc.rank, (int)npeers);
        if (PMIX_SUCCESS != (rc = PMIx_Resolve_nodes(myproc.nspace, &nodelist))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_nodes failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        pmix_output(0, "Client ns %s rank %d: PMIx_Resolve_nodes %s", myproc.nspace, myproc.rank, nodelist);
    }
    PMIX_PROC_FREE(peers, npeers);
    free(nodelist);

 done:
    /* call fence to sync */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* finalize us */
    pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank);

    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
示例#5
0
static int attach_to_running_job(char *nspace)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_query_t *query;
    size_t nq;
    myquery_data_t *q;

    /* query the active nspaces so we can verify that the
     * specified one exists */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_NAMESPACES);

    q = (myquery_data_t*)malloc(sizeof(myquery_data_t));
    DEBUG_CONSTRUCT_LOCK(&q->lock);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)q))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Query_info failed: %d\n", myproc.nspace, myproc.rank, rc);
        return -1;
    }
    DEBUG_WAIT_THREAD(&q->lock);
    DEBUG_DESTRUCT_LOCK(&q->lock);

    if (NULL == q->info) {
        fprintf(stderr, "Query returned no info\n");
        return -1;
    }
    /* the query should have returned a comma-delimited list of nspaces */
    if (PMIX_STRING != q->info[0].value.type) {
        fprintf(stderr, "Query returned incorrect data type: %d\n", q->info[0].value.type);
        return -1;
    }
    if (NULL == q->info[0].value.data.string) {
        fprintf(stderr, "Query returned no active nspaces\n");
        return -1;
    }

    fprintf(stderr, "Query returned %s\n", q->info[0].value.data.string);
    return 0;

#if 0
    /* split the returned string and look for the given nspace */

    /* if not found, then we have an error */
    PMIX_INFO_FREE(info, ninfo);

    /* get the proctable for this nspace */
    ninfo = 1;
    PMIX_INFO_CREATE(info, ninfo);
    (void)strncpy(info[0].key, PMIX_QUERY_PROC_TABLE, PMIX_MAX_KEYLEN);
    (void)strncpy(info[0].qualifier, nspace, PMIX_MAX_KEYLEN);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(info, ninfo, infocbfunc, (void*)&active))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Query_info_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        return -1;
    }
    /* wait to get a response */

    /* the query should have returned a data_array */
    if (PMIX_DATA_ARRAY != info[0].type) {
        fprintf(stderr, "Query returned incorrect data type: %d\n", info[0].type);
        return -1;
    }
    if (NULL == info[0].data.darray.array) {
        fprintf(stderr, "Query returned no proctable info\n");
        return -1;
    }
    /* the data array consists of a struct:
     *     size_t size;
     *     void* array;
     *
     * In this case, the array is composed of pmix_proc_info_t structs:
     *     pmix_proc_t proc;   // contains the nspace,rank of this proc
     *     char* hostname;
     *     char* executable_name;
     *     pid_t pid;
     *     int exit_code;
     *     pmix_proc_state_t state;
     */

    /* this is where a debugger tool would process the proctable to
     * create whatever blob it needs to provide to its daemons */
    PMIX_INFO_FREE(info, ninfo);

    /* setup the debugger daemon spawn request */
    napps = 1;
    PMIX_APP_CREATE(app, napps);
    /* setup the name of the daemon executable to launch */
    app[0].cmd = strdup("debuggerdaemon");
    app[0].argc = 1;
    app[0].argv = (char**)malloc(2*sizeof(char*));
    app[0].argv[0] = strdup("debuggerdaemon");
    app[0].argv[1] = NULL;
    /* provide directives so the daemons go where we want, and
     * let the RM know these are debugger daemons */
    ninfo = 3;
    PMIX_INFO_CREATE(app[0].info, ninfo);
    PMIX_INFO_LOAD(&app[0].info[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
    PMIX_INFO_LOAD(&app[0].info[1], PMIX_DEBUGGER_DAEMONS, true, PMIX_BOOL); // these are debugger daemons
    PMIX_INFO_LOAD(&app[0].info[2], PMIX_DEBUG_TARGET, nspace, PMIX_STRING); // the "jobid" of the application to be debugged

    /* spawn the daemons */
    PMIx_Spawn(NULL, 0, app, napps, dspace);
    /* cleanup */
    PMIX_APP_FREE(app, napps);

    /* this is where a debugger tool would wait until the debug operation is complete */

    return 0;
#endif
}
示例#6
0
文件: scr_pmix_spawn.c 项目: LLNL/scr
int main(int argc, char **argv, const char **environ)
{

        pmix_status_t rc;

        pmix_info_t *info = NULL;

        bool flag;

        pmix_status_t retval;
        pmix_app_t *spawned_app = NULL;

        pmix_info_t *job_info = NULL;
        pmix_info_t *proc_info = NULL;
        int job_info_count = 0;
        int job_info_index = 0;
        int proc_info_count = 0;
        int proc_info_index = 0;

        char spawned_nsp[PMIX_MAX_NSLEN+1];
        char *path_to_app = NULL;
        char *host_to_use = NULL;
        int number_of_clients = 0;
        int temp_counter = 0;
        done_flag = false;
        gethostname(hostn, 500);
        int spawned_app_argc = 0;

        char **scr_environ = NULL;

        int proc_count = 1;
        int node_count = 0;
        bool blocking_mode = true;
        char *node_list = NULL;
        bool forward_all_scr_envs = false;
        bool pmix_mode = false;


        const char *optstring = "+n:N:L:x:bB:pPvhe";
        int temp_slen=0;
        /* todo: add arg parsing with ompi schizo */
        verbose_print = false;

        int sleep_max = 30;
        const int fixed_sleep = 5;
        int c;
        while((c = getopt(argc, argv, optstring)) != -1){
                switch(c){
                case 'h':
                        print_usage(argv[0]);
                        exit(0);
                        break;
                case 'n':
                        proc_count = atoi(optarg);
                        if(proc_count <= 0 || proc_count > 100){
                                printf("outside the range of allowable instances to spawn [1-100]\n");
                                exit(1);
                        }
                        if(verbose_print) {
                                printf("proc_count = %d\n", proc_count);
                        }
                        break;
                case 'N':
                        /* node_count = atoi(optarg); */
                        node_count = 1;
                        if(verbose_print) {
                                printf("node_count = %d\n", node_count);
                        }
                        break;
                case 'B':
                        blocking_mode = true;
                        sleep_max = atoi(optarg);
                        if(sleep_max < 0){
                                printf("can't sleep for less than 0 seconds\n");
                                exit(1);
                        }
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'b':
                        blocking_mode = false;
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'L':
                        node_list = optarg;
                        host_to_use = node_list;
                        if(verbose_print){
                                printf("node_list = '%s'\n", node_list);
                        }
                        break;
                case 'x':
                        temp_slen = strlen(optarg);
                        /*  check if the string is the same length as 'SCR', if so compare them */
                        if(temp_slen == strlen(SCR_STRING)){
                                if(strncmp(optarg, SCR_STRING, strlen(SCR_STRING)) == 0){
                                        /* if the string is SCR, then forward all SCR related env vars */
                                        if(verbose_print) printf("all scr envs will be forwarded\n");
                                        forward_all_scr_envs = true;
                                }
                                else{
                                        /* handled like a normal env var */
                                        handle_standard_env_var(optarg, &scr_environ);
                                }
                        }
                        else{
                                /*handled like a normal env var */
                                handle_standard_env_var(optarg, &scr_environ);
                        }
                        break;
                case 'v':
                        verbose_print = true;
                        break;
                case 'p':
                        pmix_mode = true;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'P':
                        pmix_mode = false;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'e':
                        experimental = true;
                        break;
                case '?':
                        printf("missing a required argument or invalid option: %x\n", optopt);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                default:
                        printf("Unrecognized argument: %c\n", c);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                }
        }

        /* number of instances to spawn */
        number_of_clients = proc_count;


        /* check to make sure an application was specified to launch */
        if( optind < argc ){
                /* if optind is < argc, it means there is at least one more arg
                 * beyond the args for this program */
                path_to_app = argv[optind];
                spawned_app_argc = argc - optind;
                if(verbose_print) {
                        printf("app to launch: %s @ %s:%d\n",
                               path_to_app, __FILE__, __LINE__);
                }
        }
        else{
                printf("program_to_spawn option was not provded\n");
                print_usage(argv[0]);
                exit(1);
        }

        if(verbose_print){
                printf("master process will spawn %d instances; app to run: %s\n\n",
                       number_of_clients, path_to_app);
                printf("pmix version: %s (host: %s)\n", PMIx_Get_version(), hostn);
        }
        /* init pmix */
        retval = PMIx_Init(&main_proc, NULL, 0);
        if(retval != PMIX_SUCCESS){
                error_helper(retval, hostn, "error initializing pmix");
                exit(0);
        }

        if(verbose_print){
                printf("rank %d, host '%s', nspace: '%s' init'd pmix succesfully\n\n",
                       main_proc.rank, hostn, main_proc.nspace);
        }



        /* we need to attach to a "system" PMIx server so we
         * can ask it to spawn applications for us. There can
         * only be one such connection on a node, so we will
         * instruct the tool library to only look for it */
        int ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        flag = true;
        PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_TO_SYSTEM, &flag, PMIX_BOOL);

        /* initialize the library and make the connection */
        if (PMIX_SUCCESS != (rc = PMIx_tool_init(&tool_proc, NULL, 0 ))) {
                fprintf(stderr, "PMIx_tool_init failed: %d\n", rc );
                exit(rc);
        }
        if (0 < ninfo) {
                PMIX_INFO_FREE(info, ninfo);
        }




        /* first call fence to sync all processes */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS)
        {
                error_helper(retval, hostn, "error fencing");
                exit(retval);
        }

        /* Process SCR env vars if needed */

        if(forward_all_scr_envs){
                parse_all_scr_envs(&scr_environ, environ);
        }

        /* finalize the env array so a NULL is in place */
        finalize_array(scr_environ);

        /* Setup info structs to pass to this: */
        /* pmix_info_t *error_info = NULL; */
        /*  PMIX_INFO_CREATE(error_info, 1); */
        /*
          strncpy(error_info[0].key, PMIX_ERROR_GROUP_ABORT, PMIX_MAX_KEYLEN);
          error_info[0].value.type = PMIX_BOOL;
          error_info[0].value.data.flag = true;
        */

        /*  strncpy(error_info[0].key, PMIX_ERROR_GROUP_SPAWN, PMIX_MAX_KEYLEN);
            int t_val = 1;
            pmix_value_load(&error_info[1].value, &t_val, PMIX_BOOL);
        */

        /*error_info[1].value.type = PMIX_BOOL;
        error_info[1].value.data.flag = true; */

        /*  strncpy(error_info[2].key, PMIX_ERROR_GROUP_GENERAL, PMIX_MAX_KEYLEN);
            error_info[2].value.type = PMIX_BOOL;
            error_info[2].value.data.flag = true;
        */


        /* TODO: setup error handling when implemented in pmix with the
         * following error codes: */
        /*
        pmix_status_t registered_codes[5];
        registered_codes[0] = PMIX_ERR_JOB_TERMINATED;
        registered_codes[1] = PMIX_ERR_PROC_ABORTED;
        registered_codes[2] = PMIX_ERR_PROC_ABORTING;
        */
        PMIx_Register_event_handler(NULL, 0,
                                    NULL, 0,
                                    errhandler_cb,
                                    errhandler_reg_callbk,
                                    (void *) NULL);

        /*  PMIX_INFO_DESTRUCT(error_info); */

        /* allocate memory to hold the spawend app struct */
        PMIX_APP_CREATE(spawned_app, 1);

        /* maxprocs isn't documented very well, but it appears to control
         * how many instances of the spanwed app are created */
        spawned_app->maxprocs = number_of_clients;

        /* set the app to run */
        (void)asprintf(&spawned_app->cmd, "%s", path_to_app);

        /* set argv for spawned app starting with remaining argv  */
        spawned_app->argv = &argv[optind];

        /* set the environment pointer */
        spawned_app->env = scr_environ;

        /*--START: add all proc level infos */

        /* add things to the proc level info */
        if(!pmix_mode){
                job_info_count++;
        }

        if(host_to_use != NULL){
                proc_info_count++;
        }

        if(verbose_print){
                printf("enabling debug feature for forwarding stdout/stderr\n");
                proc_info_count+=2;
                /* add PMIX_FWD_STDOUT and PMIX_FWD_STDERR later*/

        }

        if(experimental){
                job_info_count++;
        }
        if(node_count == 1){
                job_info_count++;
        }
        /*--END: add all proc level infos */


        /*--START: append actual proc level info */
        PMIX_INFO_CREATE(job_info, job_info_count);
        PMIX_INFO_CREATE(proc_info, proc_info_count);
        /* PMIX_VAL_set_assign(_v, _field, _val )  */
        /* PMIX_VAL_set_strdup(_v, _field, _val )  */

        if(host_to_use != NULL){
                /* add info struct to the spawned app itself for the host */

                /* old way */
                strncpy(proc_info[proc_info_index].key, PMIX_HOST, PMIX_MAX_KEYLEN);
                //proc_info[proc_info_index].value.type = PMIX_STRING;
                /* set the data for host list to use */
                //proc_info[proc_info_index].value.data.string = host_to_use;
                /* end old way */
                if(verbose_print) printf("about to set host val\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), string,
                                    host_to_use );
                proc_info_index++;
        }

        if(!pmix_mode){
                strncpy(job_info[job_info_index].key, PMIX_NON_PMI,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set non pmix flag\n");
                PMIX_VAL_SET(&(job_info[job_info_index].value), flag, true);
                job_info_index++;
        }
        if(verbose_print){
                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDOUT,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stdout flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;

                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDERR,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stderr flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;
        }
        if(experimental){
                printf("attempting to perform experiment\n");
                bool local_flag = true;
                PMIX_INFO_LOAD(&job_info[job_info_index], PMIX_NOTIFY_COMPLETION, &local_flag, PMIX_BOOL);
                job_info_index++;
        }
        if(node_count == 1){
                strncpy(job_info[job_info_index].key, PMIX_PPR,
                        PMIX_MAX_KEYLEN);
                PMIX_VAL_SET(&(job_info[job_info_index].value), string,
                             "1:n");
                job_info_index++;
        }
        /*--END: append actual proc level info */

        /* sanity check to make sure we covered all the info structs */
        if(proc_info_index != proc_info_count ){
                printf("bug: mismatch with appending proc info\n");
                exit(1);
        }
        if(job_info_index != job_info_count){
                printf("bug: mismatch with appending job info\n");
                exit(1);
        }

        /* TODO: TEST PMIX_NOTIFY_COMPLETION WHEN IT'S IMPLEMENTED IN PMIX */

        /* fill in job_info */
        /*
        strncpy(job_info[0].key, PMIX_TIMEOUT, PMIX_MAX_KEYLEN);
        job_info[0].value.type = PMIX_INT;
        job_info[0].value.data.integer = 10; */

        /* strncpy(job_info[0].key, PMIX_NOTIFY_COMPLETION, PMIX_MAX_KEYLEN);
           job_info[0].value.type = PMIX_BOOL;
           job_info[0].value.data.flag = true; */

        /*strncpy(spawned_app->info[0].key, PMIX_DISPLAY_MAP, PMIX_MAX_KEYLEN);
          job_info[0].value.type = PMIX_BOOL;
          job_info[0].value.data.flag = true;*/


        /* TODO: TEST PMIX_NOTIFY_COMPLETION WHEN IT'S IMPLEMENTED IN PMIX */
        spawned_app->info = proc_info;
        spawned_app->ninfo = proc_info_count;

        if(verbose_print){
                printf("proc level info count: %d\n", proc_info_count);
        }
        /* call spawn */
        retval = PMIx_Spawn(job_info, job_info_count,
                            spawned_app, 1, spawned_nsp);

        if(verbose_print) {
                printf("rank %d (host %s) just called spawn; spawned nspace: %s, retval:%d\n",
                       main_proc.rank,
                       hostn,
                       spawned_nsp,
                       retval);
        }
        if(retval != PMIX_SUCCESS){
                error_helper(retval,  hostn, "error with spawn");
                goto done;
        }

        /* TODO: TEMPORARY WORKAROUND TO WAIT FOR A SPAWNED PROCESS */
        if(blocking_mode){

                sleep(fixed_sleep);

                /* wait until app completes: */
                while(!done_flag){
                        sleep(fixed_sleep);
                        temp_counter++;
                        if(temp_counter*fixed_sleep >= sleep_max) {
                                if(verbose_print) printf("broke out early\n");
                                break;
                        }
                }
                if(verbose_print){
                        if(done_flag == true) {
                                printf("done_flag was set to true!\n");
                        }
                }

        }

done:
        /* fence first */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS){
                if(verbose_print) printf("error fencing, finalize may fail ! \n");
        }
        /* finalize */

        PMIx_Deregister_event_handler(_g_errhandler_ref, NULL, NULL);

        if(verbose_print){
                fprintf(stdout,
                        "spawn master process (rank %d) (host %s) finalizing\n",
                        main_proc.rank,
                        hostn);
        }

        /* clean up pmix */

        retval = PMIx_tool_finalize();

        if(retval == PMIX_SUCCESS)
        {
                if(verbose_print){
                        printf("spawn master process %d finalize success\n\n",
                               main_proc.rank);
                }
        }
        else
        {
                printf("spawn master process %d pmix_finalize FAILURE: %d\n\n",
                       main_proc.rank,
                       retval);
        }

        retval = PMIx_Finalize(NULL, 0);
        fflush(stdout);

        /*  cleanup before returning */
        PMIX_INFO_FREE(job_info, job_info_count);
        spawned_app->argv = NULL;
        PMIX_APP_FREE(spawned_app, 1);
        if(verbose_print) printf("%s exiting cleanly :)\n", argv[0]);
        return 0;

}
示例#7
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_info_t *info;
    pmix_app_t *app;
    size_t ninfo, napps;

    /* check for user directives - this would include:
     * - a flag indicating we want to attach to a specified application
     * - application info if we are launching a new app
     */

    /* init us - if a PMIx server pid was provided, then pass it along */
    if (0 < server_pid) {
        ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        PMIX_INFO_LOAD(&info[0], PMIX_SERVER_PIDINFO, server_pid, PMIX_UINT32);
    } else {
        info = NULL;
        ninfo = 0;
    }
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) {
        fprintf(stderr, "PMIx_tool_init failed: %d\n", rc);
        exit(rc);
    }
    if (0 < ninfo) {
        PMIX_INFO_FREE(info, ninfo);
    }
    fprintf(stderr, "Tool ns %s rank %d: Running\n", myproc.nspace, myproc.rank);

    /* if we are attaching to a running job, then attach to it */
    if (attach) {
        ret = attach_to_running_job(argv[1]);
    } else {
        /* this is an initial launch - we need to launch the application
         * plus the debugger daemons, letting the RM know we are debugging
         * so that it will "pause" the app procs until we are ready */
        napps = 2;
        PMIX_APP_CREATE(app, napps);
        /* setup the executable */
        app[0].cmd = strdup("app");
        app[0].argc = 1;
        app[0].argv = (char**)malloc(2*sizeof(char*));
        app[0].argv[0] = strdup("app");
        app[0].argv[1] = NULL;
        /* provide directives so the apps do what the user requested */
        ninfo = 2;
        PMIX_INFO_CREATE(app[0].info, ninfo);
        PMIX_INFO_LOAD(&app[0].info[0], PMIX_NP, 128, PMIX_UINT64);
        PMIX_INFO_LOAD(&app[0].info[0], PMIX_MAPBY, "slot", PMIX_STRING);

        /* setup the name of the daemon executable to launch */
        app[1].cmd = strdup("debuggerdaemon");
        app[1].argc = 1;
        app[1].argv = (char**)malloc(2*sizeof(char*));
        app[1].argv[0] = strdup("debuggerdaemon");
        app[1].argv[1] = NULL;
        /* provide directives so the daemons go where we want, and
         * let the RM know these are debugger daemons */
        ninfo = 2;
        PMIX_INFO_CREATE(app[1].info, ninfo);
        PMIX_INFO_LOAD(&app[1].info[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
        PMIX_INFO_LOAD(&app[1].info[1], PMIX_DEBUGGER_DAEMONS, true, PMIX_BOOL); // these are debugger daemons
        /* spawn the daemons */
        PMIx_Spawn(NULL, 0, app, napps, dspace);
        /* cleanup */
        PMIX_APP_FREE(app, napps);

        /* this is where a debugger tool would wait until the debug operation is complete */
    }


 done:
    PMIx_tool_finalize(NULL, 0);

    return(ret);
}