Пример #1
0
/*
 * Launch the "./debuggerd" daemons against the job in namespace `appspace`
 * and register an event handler for PMIX_ERR_JOB_TERMINATED on the
 * resulting daemon job.
 *
 * appspace - namespace of the application being debugged; passed to the
 *            launcher as the PMIX_DEBUG_JOB directive
 * myrel    - its nspace field receives a copy of the daemon job's
 *            namespace, and the object is attached to the event
 *            registration as PMIX_EVENT_RETURN_OBJECT
 *
 * Returns the PMIx_Spawn status if the launch fails; otherwise the status
 * found in the lock once the registration wait completes.
 */
static pmix_status_t spawn_debugger(char *appspace, myrel_t *myrel)
{
    pmix_status_t rc;
    pmix_info_t *dinfo;
    pmix_app_t *debugger;
    size_t dninfo;
    char cwd[1024];
    char dspace[PMIX_MAX_NSLEN+1];
    mylock_t mylock;
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;

    /* setup the debugger */
    PMIX_APP_CREATE(debugger, 1);
    debugger[0].cmd = strdup("./debuggerd");
    PMIX_ARGV_APPEND(rc, debugger[0].argv, "./debuggerd");
    /* point us to our current directory - only when it could actually be
     * determined, so an unfilled buffer is never handed to strdup.
     * NOTE(review): assumes PMIX_APP_CREATE leaves cwd NULL - confirm */
    if (NULL != getcwd(cwd, sizeof(cwd))) {
        debugger[0].cwd = strdup(cwd);
    }
    /* provide directives so the daemons go where we want, and
     * let the RM know these are debugger daemons. There are seven
     * directives, each in its own slot */
    dninfo = 7;
    PMIX_INFO_CREATE(dinfo, dninfo);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
    PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUGGER_DAEMONS, NULL, PMIX_BOOL); // these are debugger daemons
    PMIX_INFO_LOAD(&dinfo[2], PMIX_DEBUG_JOB, appspace, PMIX_STRING); // the nspace being debugged
    PMIX_INFO_LOAD(&dinfo[3], PMIX_NOTIFY_COMPLETION, NULL, PMIX_BOOL); // notify us when the debugger job completes
    PMIX_INFO_LOAD(&dinfo[4], PMIX_DEBUG_WAITING_FOR_NOTIFY, NULL, PMIX_BOOL);  // tell the daemon that the proc is waiting to be released
    PMIX_INFO_LOAD(&dinfo[5], PMIX_FWD_STDOUT, NULL, PMIX_BOOL);  // forward stdout to me
    PMIX_INFO_LOAD(&dinfo[6], PMIX_FWD_STDERR, NULL, PMIX_BOOL);  // forward stderr to me
    /* spawn the daemons */
    fprintf(stderr, "Debugger: spawning %s\n", debugger[0].cmd);
    if (PMIX_SUCCESS != (rc = PMIx_Spawn(dinfo, dninfo, debugger, 1, dspace))) {
        fprintf(stderr, "Debugger daemons failed to launch with error: %s\n", PMIx_Error_string(rc));
        PMIX_INFO_FREE(dinfo, dninfo);
        PMIX_APP_FREE(debugger, 1);
        return rc;
    }
    /* cleanup */
    PMIX_INFO_FREE(dinfo, dninfo);
    PMIX_APP_FREE(debugger, 1);

    /* register callback for when this job terminates */
    myrel->nspace = strdup(dspace);
    PMIX_INFO_CREATE(dinfo, 2);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_EVENT_RETURN_OBJECT, myrel, PMIX_POINTER);
    /* only call me back when this specific job terminates */
    PMIX_INFO_LOAD(&dinfo[1], PMIX_NSPACE, dspace, PMIX_STRING);

    /* block on the lock until the registration callback has run, then
     * report whatever status it left behind */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(&code, 1, dinfo, 2,
                                release_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    PMIX_INFO_FREE(dinfo, 2);

    return rc;
}
Пример #2
0
/*
 * Run a PMIx fence over every rank (wildcard) of this process's namespace.
 *
 * collect - when non-zero, a PMIX_COLLECT_DATA boolean directive is
 *           attached to the fence
 *
 * A failed fence is reported on stderr and the process is aborted.
 */
void pmi_fence(int collect)
{
    pmix_info_t *directive = NULL;
    int ndirectives = 0;
    bool flag = 1;
    pmix_proc_t everyone;
    int status;

    /* optional directive asking for data collection */
    if (0 != collect) {
        PMIX_INFO_CREATE(directive, 1);
        (void)strncpy(directive->key, PMIX_COLLECT_DATA, PMIX_MAX_KEYLEN);
        pmix_value_load(&directive->value, &flag, PMIX_BOOL);
        ndirectives = 1;
    }

    /* the fence targets all ranks of our own namespace */
    PMIX_PROC_CONSTRUCT(&everyone);
    (void)strncpy(everyone.nspace, this_proc.nspace, PMIX_MAX_NSLEN);
    everyone.rank = PMIX_RANK_WILDCARD;

    status = PMIx_Fence(&everyone, 1, directive, ndirectives);
    if (PMIX_SUCCESS != status) {
        fprintf(stderr,  "Client ns %s rank %d: PMIx_Fence failed: %d",
                this_proc.nspace, this_proc.rank, status);
        abort();
    }

    if (0 != collect) {
        PMIX_INFO_FREE(directive, ndirectives);
    }
}
Пример #3
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_info_t *info;
    size_t ninfo;

    /* init us */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
        fprintf(stderr, "PMIx_tool_init failed: %d\n", rc);
        exit(rc);
    }
    pmix_output(0, "Tool ns %s rank %d: Running", myproc.nspace, myproc.rank);

    /* query something */
    ninfo = 2;
    PMIX_INFO_CREATE(info, ninfo);
    (void)strncpy(info[0].key, "foobar", PMIX_MAX_KEYLEN);
    (void)strncpy(info[1].key, "spastic", PMIX_MAX_KEYLEN);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info(info, ninfo))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    if (0 != strncmp(info[0].key, "foobar", PMIX_MAX_KEYLEN)) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[0] wrong: %s vs foobar",
                    myproc.nspace, myproc.rank, info[0].key);
    }
    if (0 != strncmp(info[1].key, "spastic", PMIX_MAX_KEYLEN)) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[0] wrong: %s vs spastic",
                    myproc.nspace, myproc.rank, info[1].key);
    }
    if (PMIX_STRING != info[0].value.type) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[0] wrong type: %d vs %d",
                    myproc.nspace, myproc.rank, info[0].value.type, PMIX_STRING);
    }
    if (PMIX_STRING != info[1].value.type) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[1] wrong type: %d vs %d",
                    myproc.nspace, myproc.rank, info[1].value.type, PMIX_STRING);
    }
    if (0 != strcmp(info[0].value.data.string, "0")) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[0] wrong value: %s vs 0",
                    myproc.nspace, myproc.rank, info[1].value.data.string);
    }
    if (0 != strcmp(info[1].value.data.string, "1")) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Query_info key[1] wrong value: %s vs 1",
                    myproc.nspace, myproc.rank, info[1].value.data.string);
    }
    PMIX_INFO_FREE(info, ninfo);

 done:
    /* finalize us */
    pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(rc);
}
Пример #4
0
/* Tear down an error-handler registration: release its info array when
 * one is attached and clear the handler pointer. */
static void errdes(pmix_error_reg_info_t *p)
{
    if (p->info != NULL) {
        PMIX_INFO_FREE(p->info, p->ninfo);
    }
    p->errhandler = NULL;
}
Пример #5
0
/*
 * Completion callback: frees the directives carried by the caddy passed
 * in cbdata, forwards `status` to the caddy's op callback when one is
 * set, and finally releases the caddy.
 */
static void localcbfunc(pmix_status_t status, void *cbdata)
{
    pmix_shift_caddy_t *caddy = (pmix_shift_caddy_t*)cbdata;

    /* the directives are freed before the callback is notified */
    PMIX_INFO_FREE(caddy->directives, caddy->ndirs);

    if (caddy->cbfunc.opcbfn != NULL) {
        caddy->cbfunc.opcbfn(status, caddy->cbdata);
    }

    PMIX_RELEASE(caddy);
}
Пример #6
0
/*
 * Release callback for a query: logs at verbosity 2, frees the info
 * array held by the caddy passed in cbdata (if any), then releases the
 * caddy itself.
 */
static void relcbfunc(void *cbdata)
{
    pmix_shift_caddy_t *caddy = (pmix_shift_caddy_t*)cbdata;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:query release callback");

    if (caddy->info != NULL) {
        PMIX_INFO_FREE(caddy->info, caddy->ninfo);
    }

    PMIX_RELEASE(caddy);
}
Пример #7
0
/*
 * Dispatch `status` to the registered error handlers.
 *
 * A copy of the incoming info array is built with one extra leading
 * entry, PMIX_ERROR_HANDLER_ID, holding the index of the registration
 * being called. Every registration carrying a PMIX_ERROR_NAME entry equal
 * to `status` is invoked. If none fired, the last registration found
 * without any info (the general handler) is invoked instead.
 *
 * status        - error status being reported
 * procs, nprocs - passed through to the handler unchanged
 * info, ninfo   - caller's info array; copied behind the id entry
 */
void pmix_errhandler_invoke(pmix_status_t status,
                            pmix_proc_t procs[], size_t nprocs,
                            pmix_info_t info[], size_t ninfo)
{
    pmix_error_reg_info_t *entry;
    pmix_error_reg_info_t *fallback = NULL;
    pmix_info_t *extended;
    bool handled = false;
    int slot;
    int fallback_slot = -1;   /* only read once fallback is non-NULL */
    size_t n;

    /* slot 0 carries the handler id, the caller's entries follow */
    PMIX_INFO_CREATE(extended, ninfo+1);
    (void)strncpy(extended[0].key, PMIX_ERROR_HANDLER_ID, PMIX_MAX_KEYLEN);
    extended[0].value.type = PMIX_INT;
    for (n = 0; NULL != info && n < ninfo; n++) {
        PMIX_INFO_LOAD(&extended[n+1], info[n].key, &info[n].value.data, info[n].value.type);
    }

    for (slot = 0; slot < pmix_globals.errregs.size; slot++) {
        entry = (pmix_error_reg_info_t*) pmix_pointer_array_get_item(&pmix_globals.errregs, slot);
        if (NULL == entry) {
            continue;
        }
        if (NULL == entry->info || 0 == entry->ninfo) {
            /* general handler - remembered in case nothing more specific fires */
            fallback = entry;
            fallback_slot = slot;
            continue;
        }
        extended[0].value.data.integer = slot;
        /* look for an error-name entry matching the incoming status */
        for (n = 0; n < entry->ninfo; n++) {
            if (0 != strcmp(entry->info[n].key, PMIX_ERROR_NAME)) {
                continue;
            }
            if (status != entry->info[n].value.data.int32) {
                continue;
            }
            entry->errhandler(status, procs, nprocs, extended, ninfo+1);
            handled = true;
            break;
        }
    }

    /* nothing specific fired: fall back to the general handler, if any */
    if (NULL != fallback && !handled) {
        extended[0].value.data.integer = fallback_slot;
        fallback->errhandler(status, procs, nprocs, extended, ninfo+1);
    }

    /* cleanup */
    PMIX_INFO_FREE(extended, ninfo+1);
}
Пример #8
0
/* Tear down a namespace object: free its name string and job info
 * array, release the job bucket, and destruct the list of ranks. */
static void nsdes(pmix_nspace_t *p)
{
    /* free() accepts NULL, so the name needs no guard */
    free(p->nspace);

    if (p->jobinfo != NULL) {
        PMIX_INFO_FREE(p->jobinfo, p->njobinfo);
    }

    if (p->jobbkt != NULL) {
        PMIX_RELEASE(p->jobbkt);
    }

    PMIX_LIST_DESTRUCT(&p->ranks);
}
Пример #9
0
/*
 * Tear down an op caddy: free whichever of its proc, error-proc, info
 * and app arrays are present. The info and app arrays are both sized by
 * the caddy's `sz` field.
 */
static void opdes(pmix1_opcaddy_t *p)
{
    if (p->procs != NULL) {
        PMIX_PROC_FREE(p->procs, p->nprocs);
    }

    if (p->error_procs != NULL) {
        PMIX_PROC_FREE(p->error_procs, p->nerror_procs);
    }

    if (p->info != NULL) {
        PMIX_INFO_FREE(p->info, p->sz);
    }

    if (p->apps != NULL) {
        PMIX_APP_FREE(p->apps, p->sz);
    }
}
Пример #10
0
/*
 * Register namespace `name` with the local PMIx server and wait for the
 * registration to complete.
 *
 * nprocs - used as the universe size, local size and job size
 * ranks  - list of local peers; also the input for the proc map
 * name   - namespace being registered
 *
 * Eight job-level entries are supplied: PMIX_UNIV_SIZE, PMIX_SPAWNED (0),
 * PMIX_LOCAL_SIZE, PMIX_LOCAL_PEERS, PMIX_NODE_MAP, PMIX_PROC_MAP,
 * PMIX_JOB_SIZE and PMIX_APPNUM (filled with the value of getpid()).
 */
static void set_namespace(int nprocs, char *ranks, char *name)
{
    size_t ninfo;
    pmix_info_t *info;
    /* start from NULL: the generators' results are not checked, and the
     * info array is later freed together with these strings, so they must
     * never be left holding an indeterminate pointer */
    char *regex = NULL, *ppn = NULL;
    int in_progress = 1, rc;

    ninfo = 8;
    PMIX_INFO_CREATE(info, ninfo);
    (void)strncpy(info[0].key, PMIX_UNIV_SIZE, PMIX_MAX_KEYLEN);
    info[0].value.type = PMIX_UINT32;
    info[0].value.data.uint32 = nprocs;

    (void)strncpy(info[1].key, PMIX_SPAWNED, PMIX_MAX_KEYLEN);
    info[1].value.type = PMIX_UINT32;
    info[1].value.data.uint32 = 0;

    (void)strncpy(info[2].key, PMIX_LOCAL_SIZE, PMIX_MAX_KEYLEN);
    info[2].value.type = PMIX_UINT32;
    info[2].value.data.uint32 = nprocs;

    (void)strncpy(info[3].key, PMIX_LOCAL_PEERS, PMIX_MAX_KEYLEN);
    info[3].value.type = PMIX_STRING;
    info[3].value.data.string = strdup(ranks);

    /* the generated strings are handed over to the info array */
    PMIx_generate_regex(NODE_NAME, &regex);
    (void)strncpy(info[4].key, PMIX_NODE_MAP, PMIX_MAX_KEYLEN);
    info[4].value.type = PMIX_STRING;
    info[4].value.data.string = regex;

    PMIx_generate_ppn(ranks, &ppn);
    (void)strncpy(info[5].key, PMIX_PROC_MAP, PMIX_MAX_KEYLEN);
    info[5].value.type = PMIX_STRING;
    info[5].value.data.string = ppn;

    (void)strncpy(info[6].key, PMIX_JOB_SIZE, PMIX_MAX_KEYLEN);
    info[6].value.type = PMIX_UINT32;
    info[6].value.data.uint32 = nprocs;

    (void)strncpy(info[7].key, PMIX_APPNUM, PMIX_MAX_KEYLEN);
    info[7].value.type = PMIX_UINT32;
    info[7].value.data.uint32 = getpid ();

    /* only wait when the request was accepted; release_cb is handed
     * &in_progress as its callback data */
    if (PMIX_SUCCESS == (rc = PMIx_server_register_nspace(name, nprocs, info, ninfo, release_cb, &in_progress))) {
        PMIX_WAIT_FOR_COMPLETION(in_progress);
    }
    PMIX_INFO_FREE(info, ninfo);
}
Пример #11
0
/*
 * Tool test: initialize as a PMIx tool, query PMIX_QUERY_NAMESPACES and
 * print the returned string, then finalize. Returns the status of
 * PMIx_tool_finalize.
 */
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_info_t *info;
    size_t ninfo;

    /* init us */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
        fprintf(stderr, "PMIx_tool_init failed: %s\n", PMIx_Error_string(rc));
        exit(rc);
    }
    fprintf(stderr, "Tool ns %s rank %d: Running\n", myproc.nspace, myproc.rank);

    /* query something */
    ninfo = 1;
    PMIX_INFO_CREATE(info, ninfo);
    (void)strncpy(info[0].key, PMIX_QUERY_NAMESPACES, PMIX_MAX_KEYLEN);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info(info, ninfo))) {
        fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info failed: %d\n", myproc.nspace, myproc.rank, rc);
        /* do not leak the query array on the failure path */
        PMIX_INFO_FREE(info, ninfo);
        goto done;
    }
    if (0 != strncmp(info[0].key, PMIX_QUERY_NAMESPACES, PMIX_MAX_KEYLEN)) {
        fprintf(stderr, "tool ns %s rank %d: PMIx_Query_info key[0] wrong: %s vs %s\n",
                    myproc.nspace, myproc.rank, info[0].key, PMIX_QUERY_NAMESPACES);
    }
    if (PMIX_STRING != info[0].value.type) {
        fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info key[0] wrong type: %d vs %d\n",
                    myproc.nspace, myproc.rank, info[0].value.type, PMIX_STRING);
    } else {
        /* the string member is only meaningful for a PMIX_STRING value */
        fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info key[0] returned %s\n",
                myproc.nspace, myproc.rank,
                (NULL == info[0].value.data.string) ? "NULL" : info[0].value.data.string);
    }
    PMIX_INFO_FREE(info, ninfo);

 done:
    /* finalize us */
    fprintf(stderr, "Tool ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_tool_finalize())) {
        fprintf(stderr, "Tool ns %s rank %d:PMIx_tool_finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Tool ns %s rank %d:PMIx_tool_finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(rc);
}
Пример #12
0
/*
 * Register an error handler, or replace an existing registration.
 *
 * err    - handler function to store
 * info   - optional qualifiers; copied, so the caller keeps ownership
 * ninfo  - number of entries in info
 * index  - in: 0 to create a new registration, otherwise the index of the
 *          registration to overwrite; out: the index assigned to a newly
 *          created registration
 *
 * Returns PMIX_SUCCESS, PMIX_ERR_NOT_FOUND when the index to overwrite
 * holds no registration, or PMIX_ERROR when the new registration could
 * not be added to the array.
 */
pmix_status_t pmix_add_errhandler(pmix_notification_fn_t err,
                                  pmix_info_t *info, int ninfo,
                                  int *index)
{
    int i;
    pmix_error_reg_info_t *errreg;

    if (0 != *index) {
        /* overwrite an existing entry */
        errreg = (pmix_error_reg_info_t*)pmix_pointer_array_get_item(&pmix_globals.errregs, *index);
        if (NULL == errreg) {
            return PMIX_ERR_NOT_FOUND;
        }
        errreg->errhandler = err;
        if (NULL != errreg->info) {
            PMIX_INFO_FREE(errreg->info, errreg->ninfo);
            /* forget the freed array: if no replacement info is supplied
             * below, the entry must not keep a pointer that a later
             * teardown would free a second time */
            errreg->info = NULL;
        }
        errreg->ninfo = ninfo;
    } else {
        errreg = PMIX_NEW(pmix_error_reg_info_t);
        errreg->errhandler = err;
        errreg->ninfo = ninfo;
        *index = pmix_pointer_array_add(&pmix_globals.errregs, errreg);
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "pmix_add_errhandler index =%d", *index);
        if (*index < 0) {
            PMIX_RELEASE(errreg);
            return PMIX_ERROR;
        }
    }
    /* sadly, we have to copy the info objects as we cannot
     * rely on them to remain in-memory */
    if (NULL != info && 0 < ninfo) {
        PMIX_INFO_CREATE(errreg->info, ninfo);
        for (i=0; i < ninfo; i++) {
            /* if this is a specific, single errhandler, then
             * mark it accordingly */
            if (0 == strncmp(info[i].key, PMIX_ERROR_NAME, PMIX_MAX_KEYLEN)) {
                errreg->sglhdlr = true;
            }
            (void)strncpy(errreg->info[i].key, info[i].key, PMIX_MAX_KEYLEN);
            pmix_value_xfer(&errreg->info[i].value, &info[i].value);
        }
    }

    return PMIX_SUCCESS;
}
Пример #13
0
/* Tear down a pending-connection object: free its info array and the
 * bfrops, psec, gds and cred buffers it holds. */
static void pcdes(pmix_pending_connection_t *p)
{
    if (p->info != NULL) {
        PMIX_INFO_FREE(p->info, p->ninfo);
    }

    /* free() accepts NULL, so these need no individual guards */
    free(p->bfrops);
    free(p->psec);
    free(p->gds);
    free(p->cred);
}
Пример #14
0
/*
 * Simple PMIx client test.
 *
 * Initializes as a "TEST" programming model, fetches the job size and the
 * server URI, registers a model-declaration handler and a default error
 * handler, then repeatedly puts local/remote keys, commits, fences and
 * reads the local keys of every rank back. Afterwards it dumps the keys
 * of its own modex blob, logs a message, optionally (argv[1] == "-abort")
 * exercises the abort path, and finalizes.
 *
 * `myproc` and `completed` are not declared in this function; they come
 * from elsewhere in the file.
 *
 * Returns the status of PMIx_Finalize.
 */
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    char *tmp;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    int cnt, j;
    bool doabort = false;
    volatile bool active;
    pmix_info_t info, *iptr;
    size_t ninfo;
    pmix_status_t code;

    if (1 < argc) {
        if (0 == strcmp("-abort", argv[1])) {
            doabort = true;
        }
    }

    /* init us and declare we are a test programming model */
    PMIX_INFO_CREATE(iptr, 2);
    PMIX_INFO_LOAD(&iptr[0], PMIX_PROGRAMMING_MODEL, "TEST", PMIX_STRING);
    PMIX_INFO_LOAD(&iptr[1], PMIX_MODEL_LIBRARY_NAME, "PMIX", PMIX_STRING);
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, iptr, 2))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    PMIX_INFO_FREE(iptr, 2);
    pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank);

    /* test something */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    PMIX_VALUE_RELEASE(val);

    /* test something */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_SERVER_URI, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    pmix_output(0, "CLIENT SERVER URI: %s", val->data.string);
    PMIX_VALUE_RELEASE(val);

    /* register a handler specifically for when models declare */
    active = true;
    ninfo = 1;
    PMIX_INFO_CREATE(iptr, ninfo);
    PMIX_INFO_LOAD(&iptr[0], PMIX_EVENT_HDLR_NAME, "SIMPCLIENT-MODEL", PMIX_STRING);
    code = PMIX_MODEL_DECLARED;
    PMIx_Register_event_handler(&code, 1, iptr, ninfo,
                                model_callback, model_registration_callback, (void*)&active);
    /* spin until the flag handed to the registration is cleared */
    while (active) {
        usleep(10);
    }
    PMIX_INFO_FREE(iptr, ninfo);

    /* register our errhandler */
    active = true;
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, errhandler_reg_callbk, (void*)&active);
    while (active) {
        usleep(10);
    }


    /* get our universe size */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs);

    /* put a few values - each key string built by asprintf is freed as
     * soon as the call that consumes it has returned */
    (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank);
    value.type = PMIX_UINT32;
    value.data.uint32 = 1234;
    rc = PMIx_Store_internal(&myproc, tmp, &value);
    free(tmp);
    if (PMIX_SUCCESS != rc) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Store_internal failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        goto done;
    }

    for (cnt=0; cnt < MAXCNT; cnt++) {
        (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, myproc.rank, cnt);
        value.type = PMIX_UINT64;
        value.data.uint64 = 1234;
        rc = PMIx_Put(PMIX_LOCAL, tmp, &value);
        free(tmp);
        if (PMIX_SUCCESS != rc) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s",
                        myproc.nspace, myproc.rank, PMIx_Error_string(rc));
            goto done;
        }

        (void)asprintf(&tmp, "%s-%d-remote-%d", myproc.nspace, myproc.rank, cnt);
        value.type = PMIX_STRING;
        value.data.string = "1234";
        rc = PMIx_Put(PMIX_REMOTE, tmp, &value);
        free(tmp);
        if (PMIX_SUCCESS != rc) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s",
                        myproc.nspace, myproc.rank, PMIx_Error_string(rc));
            goto done;
        }

        if (PMIX_SUCCESS != (rc = PMIx_Commit())) {
            pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Commit failed: %s",
                        myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc));
            goto done;
        }

        /* call fence to ensure the data is received */
        PMIX_PROC_CONSTRUCT(&proc);
        (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
        proc.rank = PMIX_RANK_WILDCARD;
        if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
            pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Fence failed: %s",
                        myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc));
            goto done;
        }

        /* check the returned data */
        (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
        for (j=0; j <= cnt; j++) {
            for (n=0; n < nprocs; n++) {
                proc.rank = n;
                (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, n, j);
                if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s failed: %s",
                                myproc.nspace, myproc.rank, j, tmp, PMIx_Error_string(rc));
                    free(tmp);
                    continue;
                }
                if (NULL == val) {
                    pmix_output(0, "Client ns %s rank %d: NULL value returned",
                                myproc.nspace, myproc.rank);
                    free(tmp);
                    break;
                }
                if (PMIX_UINT64 != val->type) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong type: %d", myproc.nspace, myproc.rank, j, tmp, val->type);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                    continue;
                }
                if (1234 != val->data.uint64) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong value: %d", myproc.nspace, myproc.rank, j, tmp, (int)val->data.uint64);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                    continue;
                }
                pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
                PMIX_VALUE_RELEASE(val);
                free(tmp);

                if (n != myproc.rank) {
                    (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j);
                    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
                        /* this data should _not_ be found as we are on the same node
                         * and the data was "put" with a PMIX_REMOTE scope */
                        pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
                        free(tmp);
                        continue;
                    }
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc",
                                myproc.nspace, myproc.rank, j, tmp);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                }
            }
        }
    }

    /* now get the data blob for myself */
    pmix_output(0, "Client ns %s rank %d testing internal modex blob",
                myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS == (rc = PMIx_Get(&myproc, NULL, NULL, 0, &val))) {
        if (PMIX_DATA_ARRAY != val->type) {
            pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob",
                        myproc.nspace, myproc.rank);
            PMIX_VALUE_RELEASE(val);
        } else if (PMIX_INFO != val->data.darray->type) {
            pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO",
                        myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type));
            PMIX_VALUE_RELEASE(val);
        } else if (0 == val->data.darray->size) {
            pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length",
                        myproc.nspace, myproc.rank);
            PMIX_VALUE_RELEASE(val);
        } else {
            /* distinct name so the outer iptr is not shadowed */
            pmix_info_t *blob = (pmix_info_t*)val->data.darray->array;
            for (n=0; n < val->data.darray->size; n++) {
                pmix_output(0, "\tKey: %s", blob[n].key);
            }
            PMIX_VALUE_RELEASE(val);
        }
    } else {
        pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc);
    }

    /* log something */
    PMIX_INFO_CONSTRUCT(&info);
    PMIX_INFO_LOAD(&info, PMIX_LOG_STDERR, "test log msg", PMIX_STRING);
    active = true;
    rc = PMIx_Log_nb(&info, 1, NULL, 0, opcbfunc, (void*)&active);
    if (PMIX_SUCCESS != rc) {
        pmix_output(0, "Client ns %s rank %d - log_nb returned %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
    } else {
        while (active) {
            usleep(10);
        }
    }
    PMIX_INFO_DESTRUCT(&info);

    /* if requested and our rank is 0, call abort */
    if (doabort) {
        if (0 == myproc.rank) {
            PMIx_Abort(PMIX_ERR_PROC_REQUESTED_ABORT, "CALLING ABORT", NULL, 0);
        } else {
            while(!completed) {
                usleep(10);
            }
        }
    }

 done:
    /* finalize us */
    pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %s\n",
                myproc.nspace, myproc.rank, PMIx_Error_string(rc));
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(rc);
}
Пример #15
0
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    pmix_proc_t proc;
    uint32_t nprocs;
    pmix_info_t *info;
    pmix_pdata_t *pdata;
    pmix_proc_t myproc;

    /* init us */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %d", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank);

    /* get our universe size */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs);

    /* call fence to ensure the data is received */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* publish something */
    if (0 == myproc.rank) {
        PMIX_INFO_CREATE(info, 2);
        (void)strncpy(info[0].key, "FOOBAR", PMIX_MAX_KEYLEN);
        info[0].value.type = PMIX_UINT8;
        info[0].value.data.uint8 = 1;
        (void)strncpy(info[1].key, "PANDA", PMIX_MAX_KEYLEN);
        info[1].value.type = PMIX_SIZE;
        info[1].value.data.size = 123456;
        if (PMIX_SUCCESS != (rc = PMIx_Publish(info, 2))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Publish failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        PMIX_INFO_FREE(info, 2);
    }

    /* call fence again so all procs know the data
     * has been published */
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* lookup something */
    if (0 != myproc.rank) {
        PMIX_PDATA_CREATE(pdata, 1);
        (void)strncpy(pdata[0].key, "FOOBAR", PMIX_MAX_KEYLEN);
        if (PMIX_SUCCESS != (rc = PMIx_Lookup(pdata, 1, NULL, 0))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Lookup failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        /* check the return for value and source */
        if (0 != strncmp(myproc.nspace, pdata[0].proc.nspace, PMIX_MAX_NSLEN)) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Lookup returned wrong nspace: %s",
                        myproc.nspace, myproc.rank, pdata[0].proc.nspace);
            goto done;
        }
        if (0 != pdata[0].proc.rank) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Lookup returned wrong rank: %d",
                        myproc.nspace, myproc.rank, pdata[0].proc.rank);
            goto done;
        }
        if (PMIX_UINT8 != pdata[0].value.type) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Lookup returned wrong type: %d",
                        myproc.nspace, myproc.rank, pdata[0].value.type);
            goto done;
        }
        if (1 != pdata[0].value.data.uint8) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Lookup returned wrong value: %d",
                        myproc.nspace, myproc.rank, (int)pdata[0].value.data.uint8);
            goto done;
        }
        PMIX_PDATA_FREE(pdata, 1);
        pmix_output(0, "PUBLISH-LOOKUP SUCCEEDED");
    }

    /* call fence again so rank 0 waits before leaving */
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    if (0 == myproc.rank) {
        char **keys = NULL;
        pmix_argv_append_nosize(&keys, "FOOBAR");
        pmix_argv_append_nosize(&keys, "PANDA");

        if (PMIX_SUCCESS != (rc = PMIx_Unpublish(keys, NULL, 0))) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Unpublish failed: %d", myproc.nspace, myproc.rank, rc);
            goto done;
        }
        pmix_output(0, "UNPUBLISH SUCCEEDED");
    }

    /* call fence again so everyone waits for rank 0 before leaving */
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Fence failed: %d", myproc.nspace, myproc.rank, rc);
        goto done;
    }

 done:
    /* finalize us */
    pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize())) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
Пример #16
0
/*
 * Debugger tool skeleton.
 *
 * Initializes as a PMIx tool (passing the server pid when one is set),
 * then either attaches to a running job or spawns the application "app"
 * together with the "debuggerdaemon" daemons, and finally finalizes.
 *
 * `server_pid`, `attach`, `ret` and `dspace` are not declared in this
 * function; they come from elsewhere in the file.
 */
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_info_t *info;
    pmix_app_t *app;
    size_t ninfo, napps;
    /* PMIX_INFO_LOAD is given the address of the data to copy, so the
     * directive values live in locals */
    uint64_t np = 128;
    bool daemons = true;

    /* check for user directives - this would include:
     * - a flag indicating we want to attach to a specified application
     * - application info if we are launching a new app
     */

    /* init us - if a PMIx server pid was provided, then pass it along */
    if (0 < server_pid) {
        ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        /* NOTE(review): assumes server_pid is a 32-bit object - confirm
         * against its declaration */
        PMIX_INFO_LOAD(&info[0], PMIX_SERVER_PIDINFO, &server_pid, PMIX_UINT32);
    } else {
        info = NULL;
        ninfo = 0;
    }
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) {
        fprintf(stderr, "PMIx_tool_init failed: %d\n", rc);
        exit(rc);
    }
    if (0 < ninfo) {
        PMIX_INFO_FREE(info, ninfo);
    }
    fprintf(stderr, "Tool ns %s rank %d: Running\n", myproc.nspace, myproc.rank);

    /* if we are attaching to a running job, then attach to it */
    if (attach) {
        ret = attach_to_running_job(argv[1]);
    } else {
        /* this is an initial launch - we need to launch the application
         * plus the debugger daemons, letting the RM know we are debugging
         * so that it will "pause" the app procs until we are ready */
        napps = 2;
        PMIX_APP_CREATE(app, napps);
        /* setup the executable */
        app[0].cmd = strdup("app");
        app[0].argc = 1;
        app[0].argv = (char**)malloc(2*sizeof(char*));
        app[0].argv[0] = strdup("app");
        app[0].argv[1] = NULL;
        /* provide directives so the apps do what the user requested;
         * each directive goes in its own slot */
        ninfo = 2;
        PMIX_INFO_CREATE(app[0].info, ninfo);
        PMIX_INFO_LOAD(&app[0].info[0], PMIX_NP, &np, PMIX_UINT64);
        PMIX_INFO_LOAD(&app[0].info[1], PMIX_MAPBY, "slot", PMIX_STRING);

        /* setup the name of the daemon executable to launch */
        app[1].cmd = strdup("debuggerdaemon");
        app[1].argc = 1;
        app[1].argv = (char**)malloc(2*sizeof(char*));
        app[1].argv[0] = strdup("debuggerdaemon");
        app[1].argv[1] = NULL;
        /* provide directives so the daemons go where we want, and
         * let the RM know these are debugger daemons */
        ninfo = 2;
        PMIX_INFO_CREATE(app[1].info, ninfo);
        PMIX_INFO_LOAD(&app[1].info[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
        PMIX_INFO_LOAD(&app[1].info[1], PMIX_DEBUGGER_DAEMONS, &daemons, PMIX_BOOL); // these are debugger daemons
        /* spawn the daemons - a failed launch is reported, not ignored */
        if (PMIX_SUCCESS != (rc = PMIx_Spawn(NULL, 0, app, napps, dspace))) {
            fprintf(stderr, "PMIx_Spawn failed: %d\n", rc);
        }
        /* cleanup */
        PMIX_APP_FREE(app, napps);

        /* this is where a debugger tool would wait until the debug operation is complete */
    }


 done:
    PMIx_tool_finalize(NULL, 0);

    return(ret);
}
Пример #17
0
/**
 * Dispatch an error notification to the registered error handlers.
 *
 * Every entry of pmix_globals.errregs is examined:
 *   - a registration without any info keys is remembered as the default
 *     handler (the last such entry wins);
 *   - a registration whose PMIX_ERROR_NAME key equals @p status is fired
 *     as an exact match;
 *   - otherwise the registration is fired once if any of its info entries
 *     matches (same key and equal value) any entry of @p info.
 * If nothing fired, the default handler - if one was found - is invoked.
 *
 * The array handed to each handler is a private copy of @p info with one
 * extra leading entry (PMIX_ERROR_HANDLER_ID) holding the index of the
 * handler being called; the copy is released before returning.
 *
 * @param status  error status being reported
 * @param procs   processes affected by the error (passed through untouched)
 * @param nprocs  number of entries in @p procs
 * @param info    optional info array supplied with the error (may be NULL)
 * @param ninfo   number of entries in @p info
 */
void pmix_errhandler_invoke(pmix_status_t status,
                            pmix_proc_t procs[], size_t nprocs,
                            pmix_info_t info[], size_t ninfo)
{
    int i, idflt = -1;
    size_t j, k;
    bool fired = false;
    bool exact_match;
    bool key_match;
    pmix_error_reg_info_t *errreg, *errdflt = NULL;
    pmix_info_t *iptr;

    /* we will need to provide the errhandler reference id when
     * we provide the callback. Since the callback function doesn't
     * provide a param for that purpose, we have to add it to any
     * info array that came from the RM, so extend the array by 1 */
    PMIX_INFO_CREATE(iptr, ninfo+1);
    /* put the reference id in the first location */
    (void)strncpy(iptr[0].key, PMIX_ERROR_HANDLER_ID, PMIX_MAX_KEYLEN);
    iptr[0].value.type = PMIX_INT;
    /* we don't know the reference id yet, but we'll fill that in
     * later - for now, just copy the incoming info array across */
    if (NULL != info) {
        for (j = 0; j < ninfo; j++) {
            PMIX_INFO_LOAD(&iptr[j+1], info[j].key, &info[j].value.data, info[j].value.type);
        }
    }

    /* search our array of errhandlers for a match. We take any specific
     * error status first, then any handler sharing an info key/value with
     * the incoming event. If neither of those have been registered, then
     * use any default errhandler - otherwise, ignore it */
    for (i = 0; i < pmix_globals.errregs.size; i++) {
        errreg = (pmix_error_reg_info_t*) pmix_pointer_array_get_item(&pmix_globals.errregs, i);
        if (NULL == errreg) {
            continue;
        }
        if (NULL == errreg->info || 0 == errreg->ninfo) {
            /* this is a general err handler - we will call it if there is no better match */
            errdflt = errreg;
            idflt = i;
            continue;
        }
        /* tell the handler which registration is being invoked */
        iptr[0].value.data.integer = i;

        /* match error name key first */
        exact_match = false;
        for (j = 0; j < errreg->ninfo; j++) {
            if ((0 == strcmp(errreg->info[j].key, PMIX_ERROR_NAME)) &&
                (status == errreg->info[j].value.data.int32)) {
                errreg->errhandler(status, procs, nprocs, iptr, ninfo+1);
                fired = true;
                exact_match = true;
                break;
            }
        }
        if (exact_match || NULL == info) {
            continue;
        }

        /* if no exact match was found, then we will fire the errhandler
         * for any matching info key. This may be too lax and need to be
         * adjusted later. Every registered entry is compared against every
         * incoming entry, each bounded by the length of its own array, and
         * the handler is fired at most once per event */
        key_match = false;
        for (j = 0; j < errreg->ninfo && !key_match; j++) {
            for (k = 0; k < ninfo; k++) {
                if ((0 == strcmp(errreg->info[j].key, info[k].key)) &&
                    (pmix_value_cmp(&errreg->info[j].value, &info[k].value))) {
                    key_match = true;
                    break;
                }
            }
        }
        if (key_match) {
            errreg->errhandler(status, procs, nprocs, iptr, ninfo+1);
            fired = true;
        }
    }

    /* if nothing fired and we found a general err handler, then fire it */
    if (!fired && NULL != errdflt) {
        iptr[0].value.data.integer = idflt;
        errdflt->errhandler(status, procs, nprocs, iptr, ninfo+1);
    }
    /* cleanup */
    PMIX_INFO_FREE(iptr, ninfo+1);
}
Пример #18
0
/**
 * Non-blocking request to log the given data.
 *
 * Clients and tools (anything that is neither a server nor a launcher)
 * never log locally: the request is packed (command, timestamp, data,
 * directives) and relayed to the server, with log_cbfunc receiving the
 * reply. Servers and launchers hand the request to the plog framework;
 * when no PMIX_LOG_SOURCE directive was supplied, this process is recorded
 * as the source by appending that directive to a private copy of the
 * directives.
 *
 * @param data        items to be logged (must be non-NULL)
 * @param ndata       number of entries in @p data (must be non-zero)
 * @param directives  optional directives; PMIX_LOG_GENERATE_TIMESTAMP and
 *                    PMIX_LOG_SOURCE are inspected here
 * @param ndirs       number of entries in @p directives
 * @param cbfunc      completion callback
 * @param cbdata      opaque pointer handed back to @p cbfunc
 *
 * @return PMIX_ERR_INIT if the library is not initialized,
 *         PMIX_ERR_BAD_PARAM if no data was given,
 *         PMIX_ERR_UNREACH if a client/tool is not connected,
 *         PMIX_ERR_NOT_SUPPORTED if this process is already the recorded
 *         source of the request, otherwise the status of the pack/send or
 *         plog operation.
 */
PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata,
                                      const pmix_info_t directives[], size_t ndirs,
                                      pmix_op_cbfunc_t cbfunc, void *cbdata)
{
    pmix_shift_caddy_t *cd;
    pmix_cmd_t cmd = PMIX_LOG_CMD;
    pmix_buffer_t *msg;
    pmix_status_t rc;
    size_t n;
    time_t timestamp = 0;
    pmix_proc_t *source = NULL;

    PMIX_ACQUIRE_THREAD(&pmix_global_lock);

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:log non-blocking");

    if (pmix_globals.init_cntr <= 0) {
        PMIX_RELEASE_THREAD(&pmix_global_lock);
        return PMIX_ERR_INIT;
    }

    if (0 == ndata || NULL == data) {
        PMIX_RELEASE_THREAD(&pmix_global_lock);
        return PMIX_ERR_BAD_PARAM;
    }

    /* check the directives - if they requested a timestamp, then
     * get the time, also look for a source */
    if (NULL != directives) {
        for (n=0; n < ndirs; n++) {
            if (0 == strncmp(directives[n].key, PMIX_LOG_GENERATE_TIMESTAMP, PMIX_MAX_KEYLEN)) {
                if (PMIX_INFO_TRUE(&directives[n])) {
                    /* pickup the timestamp */
                    timestamp = time(NULL);
                }
            } else if (0 == strncmp(directives[n].key, PMIX_LOG_SOURCE, PMIX_MAX_KEYLEN)) {
                source = directives[n].value.data.proc;
            }
        }
    }

    /* if we are a client or tool, we never do this ourselves - we
     * always pass this request to our server for execution */
    if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) &&
        !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) {
        /* if we aren't connected, don't attempt to send */
        if (!pmix_globals.connected) {
            PMIX_RELEASE_THREAD(&pmix_global_lock);
            return PMIX_ERR_UNREACH;
        }
        PMIX_RELEASE_THREAD(&pmix_global_lock);

        /* if we are not a server, then relay this request to the server */
        cd = PMIX_NEW(pmix_shift_caddy_t);
        cd->cbfunc.opcbfn = cbfunc;
        cd->cbdata = cbdata;
        msg = PMIX_NEW(pmix_buffer_t);
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &cmd, 1, PMIX_COMMAND);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        /* provide the timestamp - zero will indicate
         * that it wasn't taken */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &timestamp, 1, PMIX_TIME);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        /* pack the number of data entries */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &ndata, 1, PMIX_SIZE);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        /* ndata was verified non-zero above, the guard is kept for
         * symmetry with the directives below */
        if (0 < ndata) {
            PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                             msg, data, ndata, PMIX_INFO);
            if (PMIX_SUCCESS != rc) {
                PMIX_ERROR_LOG(rc);
                PMIX_RELEASE(msg);
                PMIX_RELEASE(cd);
                return rc;
            }
        }
        /* pack the number of directives, followed by the directives */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &ndirs, 1, PMIX_SIZE);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        if (0 < ndirs) {
            PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                             msg, directives, ndirs, PMIX_INFO);
            if (PMIX_SUCCESS != rc) {
                PMIX_ERROR_LOG(rc);
                PMIX_RELEASE(msg);
                PMIX_RELEASE(cd);
                return rc;
            }
        }

        pmix_output_verbose(2, pmix_plog_base_framework.framework_output,
                            "pmix:log sending to server");
        PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver,
                           msg, log_cbfunc, (void*)cd);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(cd);
        }
        return rc;
    }
    PMIX_RELEASE_THREAD(&pmix_global_lock);

    /* if no recorded source was found, then we must be it */
    if (NULL == source) {
        source = &pmix_globals.myid;
        cd = PMIX_NEW(pmix_shift_caddy_t);
        cd->cbfunc.opcbfn = cbfunc;
        cd->cbdata = cbdata;
        /* copy the directives, leaving room for one more entry that
         * records us as the source */
        cd->ndirs = ndirs + 1;
        PMIX_INFO_CREATE(cd->directives, cd->ndirs);
        for (n=0; n < ndirs; n++) {
            PMIX_INFO_XFER(&cd->directives[n], (pmix_info_t*)&directives[n]);
        }
        /* load the proc itself - "source" already is the pmix_proc_t
         * pointer, so it must not be passed by address */
        PMIX_INFO_LOAD(&cd->directives[ndirs], PMIX_LOG_SOURCE, source, PMIX_PROC);
        /* call down to process the request - the various components
         * will thread shift as required */
        rc = pmix_plog.log(source, data, ndata, cd->directives, cd->ndirs, localcbfunc, cd);
        if (PMIX_SUCCESS != rc) {
            PMIX_INFO_FREE(cd->directives, cd->ndirs);
            PMIX_RELEASE(cd);
        }
    } else if (0 == strncmp(source->nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN) &&
               source->rank == pmix_globals.myid.rank) {
        /* if I am the recorded source, then this is a re-submission of
         * something that got "upcalled" by a prior call. In this case,
         * we return a "not supported" error as clearly we couldn't
         * handle it, and neither could our host */
        rc = PMIX_ERR_NOT_SUPPORTED;
    } else {
        /* call down to process the request - the various components
         * will thread shift as required */
        rc = pmix_plog.log(source, data, ndata, directives, ndirs, cbfunc, cbdata);
    }

    return rc;
}
Пример #19
0
/**
 * Receive callback collecting the rollup buffers sent during startup.
 *
 * Each invocation increments ncollected. A buffer that came from this
 * process is stashed in mybucket; any other buffer is appended to bucket
 * and its leading entry is inspected for connection info, which - when
 * present - is unpacked with PMIx and stored via PMIx_Store_internal under
 * PMIX_PROC_URI for the sending process. report_orted() is called on every
 * path, including errors.
 *
 * @param status  unused
 * @param sender  name of the process that sent @p buffer
 * @param buffer  payload of the message
 * @param tag     unused
 * @param cbdata  unused
 */
static void rollup(int status, orte_process_name_t* sender,
                   opal_buffer_t *buffer,
                   orte_rml_tag_t tag, void *cbdata)
{
    int ret;
    orte_process_name_t child;
    int32_t flag, cnt;
    size_t ninfo;
    opal_byte_object_t *boptr;
    pmix_data_buffer_t pbkt;
    pmix_info_t *info;
    pmix_proc_t proc;
    pmix_status_t prc;

    ncollected++;

    /* if the sender is ourselves, then we save that buffer
     * so we can insert it at the beginning */
    if (sender->jobid == ORTE_PROC_MY_NAME->jobid &&
        sender->vpid == ORTE_PROC_MY_NAME->vpid) {
        mybucket = OBJ_NEW(opal_buffer_t);
        opal_dss.copy_payload(mybucket, buffer);
        goto report;
    }

    /* xfer the contents of the rollup to our bucket */
    opal_dss.copy_payload(bucket, buffer);
    /* the first entry in the bucket will be from our
     * direct child - harvest it for connection info */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &child, &cnt, ORTE_NAME))) {
        ORTE_ERROR_LOG(ret);
        goto report;
    }
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &flag, &cnt, OPAL_INT32))) {
        ORTE_ERROR_LOG(ret);
        goto report;
    }
    if (0 >= flag) {
        /* no connection info was included */
        goto report;
    }

    (void)opal_snprintf_jobid(proc.nspace, PMIX_MAX_NSLEN, sender->jobid);
    proc.rank = sender->vpid;
    /* we have connection info */
    cnt = 1;
    if (ORTE_SUCCESS != (ret = opal_dss.unpack(buffer, &boptr, &cnt, OPAL_BYTE_OBJECT))) {
        ORTE_ERROR_LOG(ret);
        goto report;
    }
    /* it was packed using PMIx, so unpack it the same way */
    /* NOTE(review): neither boptr nor pbkt is released in this function -
     * confirm who owns the byte object after PMIX_DATA_BUFFER_LOAD */
    PMIX_DATA_BUFFER_LOAD(&pbkt, boptr->bytes, boptr->size);
    /* remember the allocated size: "flag" doubles as the in/out count of
     * the unpack call below and so cannot be relied on when freeing */
    ninfo = (size_t)flag;
    PMIX_INFO_CREATE(info, ninfo);
    if (PMIX_SUCCESS != (prc = PMIx_Data_unpack(&proc, &pbkt, (void*)info, &flag, PMIX_INFO))) {
        PMIX_ERROR_LOG(prc);
        PMIX_INFO_FREE(info, ninfo);
        goto report;
    }
    for (cnt=0; cnt < flag; cnt++) {
        prc = PMIx_Store_internal(&proc, PMIX_PROC_URI, &info[cnt].value);
        if (PMIX_SUCCESS != prc) {
            PMIX_ERROR_LOG(prc);
            break;
        }
    }
    PMIX_INFO_FREE(info, ninfo);

  report:
    report_orted();
}
Пример #20
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t *info;
    size_t ninfo;
    pmix_query_t *query;
    size_t nq, n;
    myquery_data_t myquery_data;
    pid_t pid;
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;
    mylock_t mylock;
    myrel_t myrel;
    uint16_t localrank;
    char *target = NULL;

    pid = getpid();

    /* init us - since we were launched by the RM, our connection info
     * will have been provided at startup. */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
        fprintf(stderr, "Debugger daemon: PMIx_tool_init failed: %d\n", rc);
        exit(0);
    }
    fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid);


    /* register our default event handler */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    if (PMIX_SUCCESS != mylock.status) {
        rc = mylock.status;
        DEBUG_DESTRUCT_LOCK(&mylock);
        goto done;
    }
    DEBUG_DESTRUCT_LOCK(&mylock);

    /* get the nspace of the job we are to debug - it will be in our JOB info */
#ifdef PMIX_LOAD_PROCID
    PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
#else
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_KEYLEN);
    proc.rank = PMIX_RANK_WILDCARD;
#endif
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - error %s\n",
                myproc.nspace, myproc.rank,
                (unsigned long)pid, PMIx_Error_string(rc));
        goto done;
    }
    if (NULL == val || PMIX_STRING != val->type || NULL == val->data.string) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - NULL data returned\n",
                myproc.nspace, myproc.rank, (unsigned long)pid);
        goto done;
    }
    /* save it for later */
    target = strdup(val->data.string);
    PMIX_VALUE_RELEASE(val);

    fprintf(stderr, "[%s:%d:%lu] Debugging %s\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, target);

    /* get my local rank so I can determine which local proc is "mine"
     * to debug */
    val = NULL;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_LOCAL_RANK, NULL, 0, &val))) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - error %s\n",
                myproc.nspace, myproc.rank,
                (unsigned long)pid, PMIx_Error_string(rc));
        goto done;
    }
    if (NULL == val) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - NULL data returned\n",
                myproc.nspace, myproc.rank, (unsigned long)pid);
        goto done;
    }
    if (PMIX_UINT16 != val->type) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - returned wrong type %s\n",
                myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Data_type_string(val->type));
        goto done;
    }
    /* save the data */
    localrank = val->data.uint16;
    PMIX_VALUE_RELEASE(val);
    fprintf(stderr, "[%s:%d:%lu] my local rank %d\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, (int)localrank);

    /* register another handler specifically for when the target
     * job completes */
    DEBUG_CONSTRUCT_LOCK(&myrel.lock);
    myrel.nspace = strdup(proc.nspace);
    PMIX_INFO_CREATE(info, 2);
    PMIX_INFO_LOAD(&info[0], PMIX_EVENT_RETURN_OBJECT, &myrel, PMIX_POINTER);
    /* only call me back when this specific job terminates */
    PMIX_LOAD_PROCID(&proc, target, PMIX_RANK_WILDCARD);
    PMIX_INFO_LOAD(&info[1], PMIX_EVENT_AFFECTED_PROC, &proc, PMIX_PROC);

    fprintf(stderr, "[%s:%d:%lu] registering for termination of %s\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, proc.nspace);


    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(&code, 1, info, 2,
                                release_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    if (PMIX_SUCCESS != mylock.status) {
        rc = mylock.status;
        DEBUG_DESTRUCT_LOCK(&mylock);
        PMIX_INFO_FREE(info, 2);
        goto done;
    }
    DEBUG_DESTRUCT_LOCK(&mylock);
    PMIX_INFO_FREE(info, 2);

    /* get our local proctable - for scalability reasons, we don't want to
     * have our "root" debugger process get the proctable for everybody and
     * send it out to us. So ask the local PMIx server for the pid's of
     * our local target processes */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE);
    query[0].nqual = 1;
    PMIX_INFO_CREATE(query[0].qualifiers, 1);
    PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, target, PMIX_STRING);  // the nspace we are enquiring about
    /* setup the caddy to retrieve the data */
    DEBUG_CONSTRUCT_LOCK(&myquery_data.lock);
    myquery_data.info = NULL;
    myquery_data.ninfo = 0;
    /* execute the query */
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
        fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
        goto done;
    }
    DEBUG_WAIT_THREAD(&myquery_data.lock);
    DEBUG_DESTRUCT_LOCK(&myquery_data.lock);
    PMIX_QUERY_FREE(query, nq);
    if (PMIX_SUCCESS != myquery_data.status) {
        rc = myquery_data.status;
        goto done;
    }

    fprintf(stderr, "[%s:%d:%lu] Local proctable received\n", myproc.nspace, myproc.rank, (unsigned long)pid);


    /* now that we have the proctable for our local processes, we can do our
     * magic debugger stuff and attach to them. We then send a "release" event
     * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We
     * do this with the event notification system. For this example, we just
     * send it to all local procs of the job being debugged */
    (void)strncpy(proc.nspace, target, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    ninfo = 2;
    PMIX_INFO_CREATE(info, ninfo);
    PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC);  // deliver to the target nspace
    PMIX_INFO_LOAD(&info[1], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL);  // deliver to the target nspace
    fprintf(stderr, "[%s:%u:%lu] Sending release\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    rc = PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
                           NULL, PMIX_RANGE_CUSTOM,
                           info, ninfo, NULL, NULL);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "%s[%s:%u:%lu] Sending release failed with error %s(%d)\n",
                myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Error_string(rc), rc);
        goto done;
    }

    /* do some debugger magic while waiting for the job to terminate */
    DEBUG_WAIT_THREAD(&myrel.lock);

  done:
    if (NULL != target) {
        free(target);
    }
    /* finalize us */
    fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Finalizing\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    }
    fflush(stderr);
    return(0);
}
Пример #21
0
/**
 * Query the active namespaces as the first step of attaching to a job.
 *
 * Issues a PMIX_QUERY_NAMESPACES query, waits for cbfunc to release the
 * lock of the query caddy and prints the returned string. The remainder
 * of the attach procedure (verifying @p nspace, fetching its proctable and
 * spawning the daemons) is not implemented yet - see the disabled sketch
 * at the end of the function.
 *
 * @param nspace  namespace the caller wants to attach to (currently only
 *                referenced by the disabled code)
 * @return 0 if the query returned a string of namespaces, -1 otherwise
 */
static int attach_to_running_job(char *nspace)
{
    pmix_status_t rc;
    pmix_query_t *query;
    size_t nq;
    myquery_data_t *q;
    int ret = -1;

    (void)nspace;

    /* query the active nspaces so we can verify that the
     * specified one exists */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_NAMESPACES);

    q = (myquery_data_t*)malloc(sizeof(myquery_data_t));
    if (NULL == q) {
        fprintf(stderr, "Unable to allocate the query caddy\n");
        PMIX_QUERY_FREE(query, nq);
        return -1;
    }
    /* make sure the "no info" check below reads a defined value even
     * when the callback does not store anything */
    q->info = NULL;
    DEBUG_CONSTRUCT_LOCK(&q->lock);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)q))) {
        fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
        DEBUG_DESTRUCT_LOCK(&q->lock);
        goto cleanup;
    }
    DEBUG_WAIT_THREAD(&q->lock);
    DEBUG_DESTRUCT_LOCK(&q->lock);

    if (NULL == q->info) {
        fprintf(stderr, "Query returned no info\n");
        goto cleanup;
    }
    /* the query should have returned a comma-delimited list of nspaces */
    if (PMIX_STRING != q->info[0].value.type) {
        fprintf(stderr, "Query returned incorrect data type: %d\n", q->info[0].value.type);
        goto cleanup;
    }
    if (NULL == q->info[0].value.data.string) {
        fprintf(stderr, "Query returned no active nspaces\n");
        goto cleanup;
    }

    fprintf(stderr, "Query returned %s\n", q->info[0].value.data.string);
    ret = 0;

  cleanup:
    /* NOTE(review): q->info is whatever cbfunc stored - confirm whether
     * it has to be released here as well */
    PMIX_QUERY_FREE(query, nq);
    free(q);
    return ret;

#if 0
    /* split the returned string and look for the given nspace */

    /* if not found, then we have an error */
    PMIX_INFO_FREE(info, ninfo);

    /* get the proctable for this nspace */
    ninfo = 1;
    PMIX_INFO_CREATE(info, ninfo);
    (void)strncpy(info[0].key, PMIX_QUERY_PROC_TABLE, PMIX_MAX_KEYLEN);
    (void)strncpy(info[0].qualifier, nspace, PMIX_MAX_KEYLEN);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(info, ninfo, infocbfunc, (void*)&active))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Query_info_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        return -1;
    }
    /* wait to get a response */

    /* the query should have returned a data_array */
    if (PMIX_DATA_ARRAY != info[0].type) {
        fprintf(stderr, "Query returned incorrect data type: %d\n", info[0].type);
        return -1;
    }
    if (NULL == info[0].data.darray.array) {
        fprintf(stderr, "Query returned no proctable info\n");
        return -1;
    }
    /* the data array consists of a struct:
     *     size_t size;
     *     void* array;
     *
     * In this case, the array is composed of pmix_proc_info_t structs:
     *     pmix_proc_t proc;   // contains the nspace,rank of this proc
     *     char* hostname;
     *     char* executable_name;
     *     pid_t pid;
     *     int exit_code;
     *     pmix_proc_state_t state;
     */

    /* this is where a debugger tool would process the proctable to
     * create whatever blob it needs to provide to its daemons */
    PMIX_INFO_FREE(info, ninfo);

    /* setup the debugger daemon spawn request */
    napps = 1;
    PMIX_APP_CREATE(app, napps);
    /* setup the name of the daemon executable to launch */
    app[0].cmd = strdup("debuggerdaemon");
    app[0].argc = 1;
    app[0].argv = (char**)malloc(2*sizeof(char*));
    app[0].argv[0] = strdup("debuggerdaemon");
    app[0].argv[1] = NULL;
    /* provide directives so the daemons go where we want, and
     * let the RM know these are debugger daemons */
    ninfo = 3;
    PMIX_INFO_CREATE(app[0].info, ninfo);
    PMIX_INFO_LOAD(&app[0].info[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
    PMIX_INFO_LOAD(&app[0].info[1], PMIX_DEBUGGER_DAEMONS, true, PMIX_BOOL); // these are debugger daemons
    PMIX_INFO_LOAD(&app[0].info[2], PMIX_DEBUG_TARGET, nspace, PMIX_STRING); // the "jobid" of the application to be debugged

    /* spawn the daemons */
    PMIx_Spawn(NULL, 0, app, napps, dspace);
    /* cleanup */
    PMIX_APP_FREE(app, napps);

    /* this is where a debugger tool would wait until the debug operation is complete */

    return 0;
#endif
}
Пример #22
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_info_t *info, *iptr;
    pmix_app_t *app;
    size_t ninfo, napps;
    char *nspace = NULL;
    int i;
    pmix_query_t *query;
    size_t nq, n;
    myquery_data_t myquery_data;
    bool cospawn = false, stop_on_exec = false, cospawn_reqd = false;
    char cwd[1024];
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;
    mylock_t mylock;
    myrel_t myrel, launcher_ready, dbrel;
    pid_t pid;
    pmix_envar_t envar;
    char *launchers[] = {
        "prun",
        "mpirun",
        "mpiexec",
        "orterun",
        NULL
    };
    pmix_proc_t proc;
    bool found;
    pmix_data_array_t darray;
    char *tmp;
    char clientspace[PMIX_MAX_NSLEN+1];

    pid = getpid();

    /* Process any arguments we were given */
    for (i=1; i < argc; i++) {
        if (0 == strcmp(argv[i], "-h") ||
            0 == strcmp(argv[i], "--help")) {
            /* print the usage message and exit */

        }
        if (0 == strcmp(argv[i], "-a") ||
            0 == strcmp(argv[i], "--attach")) {
            if (NULL != nspace) {
                /* can only support one */
                fprintf(stderr, "Cannot attach to more than one nspace\n");
                exit(1);
            }
            /* the next argument must be the nspace */
            ++i;
            if (argc == i) {
                /* they goofed */
                fprintf(stderr, "The %s option requires an <nspace> argument\n", argv[i]);
                exit(1);
            }
            nspace = strdup(argv[i]);
        } else if (0 == strcmp(argv[i], "-c") ||
                   0 == strcmp(argv[i], "--cospawn")){
            cospawn_reqd = true;
            break;
        }
    }
    info = NULL;
    ninfo = 0;

    /* use the system connection first, if available */
    PMIX_INFO_CREATE(info, 1);
    PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_SYSTEM_FIRST, NULL, PMIX_BOOL);
    /* init as a tool */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) {
        fprintf(stderr, "PMIx_tool_init failed: %s(%d)\n", PMIx_Error_string(rc), rc);
        exit(rc);
    }
    PMIX_INFO_FREE(info, ninfo);

    fprintf(stderr, "Debugger ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid);

    /* construct the debugger termination release */
    DEBUG_CONSTRUCT_LOCK(&dbrel.lock);

    /* register a default event handler */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    DEBUG_DESTRUCT_LOCK(&mylock);

    /* if we are attaching to a running job, then attach to it */
    if (NULL != nspace) {
        if (PMIX_SUCCESS != (rc = attach_to_running_job(nspace))) {
            fprintf(stderr, "Failed to attach to nspace %s: error code %d\n",
                    nspace, rc);
            goto done;
        }
    }


  done:
    DEBUG_DESTRUCT_LOCK(&myrel.lock);
    DEBUG_DESTRUCT_LOCK(&dbrel.lock);
    PMIx_tool_finalize();

    return(rc);
}
Пример #23
0
/*
 * Spawn launcher entry point.
 *
 * Parses the command line, initializes PMIx, asks PMIx to spawn the
 * requested application and, in blocking mode, polls the global done_flag
 * until it is set or the configured time budget is used up.
 *
 * Options (see optstring):
 *   -n <count>   number of instances to spawn, 1-100 (default 1)
 *   -N <arg>     request one process per node (the argument is ignored)
 *   -L <hosts>   host list handed to the spawned app via PMIX_HOST
 *   -x <name>    forward an environment variable; SCR_STRING forwards all
 *                SCR-related variables
 *   -b           non-blocking mode
 *   -B <secs>    blocking mode with a maximum wait of <secs> seconds
 *   -p / -P      enable / disable pmix mode (PMIX_NON_PMI is set when off)
 *   -v           verbose output (only affects options parsed after it)
 *   -e           experimental: request PMIX_NOTIFY_COMPLETION
 *   -h           print usage and exit
 * The first non-option argument is the application to launch; it and
 * everything after it become the spawned application's argv.
 *
 * Returns 0 on success and 1 when PMIx_Spawn fails; earlier fatal errors
 * terminate the process through exit() with a non-zero status.
 */
int main(int argc, char **argv, const char **environ)
{

        pmix_status_t rc;
        pmix_status_t retval;
        /* process exit status; stays 0 unless the spawn request fails */
        int exit_status = 0;

        pmix_info_t *info = NULL;

        bool flag;

        pmix_app_t *spawned_app = NULL;

        pmix_info_t *job_info = NULL;
        pmix_info_t *proc_info = NULL;
        int job_info_count = 0;
        int job_info_index = 0;
        int proc_info_count = 0;
        int proc_info_index = 0;

        /* zero-filled so the verbose print after a failed spawn never reads
         * uninitialized bytes */
        char spawned_nsp[PMIX_MAX_NSLEN+1] = {0};
        char *path_to_app = NULL;
        char *host_to_use = NULL;
        int number_of_clients = 0;
        int temp_counter = 0;
        done_flag = false;
        gethostname(hostn, 500);

        char **scr_environ = NULL;

        int proc_count = 1;
        int node_count = 0;
        bool blocking_mode = true;
        char *node_list = NULL;
        bool forward_all_scr_envs = false;
        bool pmix_mode = false;


        const char *optstring = "+n:N:L:x:bB:pPvhe";
        /* todo: add arg parsing with ompi schizo */
        verbose_print = false;

        int sleep_max = 30;
        const int fixed_sleep = 5;
        int c;
        while((c = getopt(argc, argv, optstring)) != -1){
                switch(c){
                case 'h':
                        print_usage(argv[0]);
                        exit(0);
                        break;
                case 'n':
                        proc_count = atoi(optarg);
                        if(proc_count <= 0 || proc_count > 100){
                                printf("outside the range of allowable instances to spawn [1-100]\n");
                                exit(1);
                        }
                        if(verbose_print) {
                                printf("proc_count = %d\n", proc_count);
                        }
                        break;
                case 'N':
                        /* the option argument is currently ignored; the
                         * option only switches on one-process-per-node */
                        /* node_count = atoi(optarg); */
                        node_count = 1;
                        if(verbose_print) {
                                printf("node_count = %d\n", node_count);
                        }
                        break;
                case 'B':
                        blocking_mode = true;
                        sleep_max = atoi(optarg);
                        if(sleep_max < 0){
                                printf("can't sleep for less than 0 seconds\n");
                                exit(1);
                        }
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'b':
                        blocking_mode = false;
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'L':
                        node_list = optarg;
                        host_to_use = node_list;
                        if(verbose_print){
                                printf("node_list = '%s'\n", node_list);
                        }
                        break;
                case 'x':
                        /* an argument that is exactly SCR_STRING requests
                         * forwarding of all SCR related env vars; anything
                         * else is handled like a normal env var */
                        if(strcmp(optarg, SCR_STRING) == 0){
                                if(verbose_print) printf("all scr envs will be forwarded\n");
                                forward_all_scr_envs = true;
                        }
                        else{
                                handle_standard_env_var(optarg, &scr_environ);
                        }
                        break;
                case 'v':
                        verbose_print = true;
                        break;
                case 'p':
                        pmix_mode = true;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'P':
                        pmix_mode = false;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'e':
                        experimental = true;
                        break;
                case '?':
                        printf("missing a required argument or invalid option: %x\n", optopt);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                default:
                        printf("Unrecognized argument: %c\n", c);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                }
        }

        /* number of instances to spawn */
        number_of_clients = proc_count;


        /* check to make sure an application was specified to launch */
        if( optind < argc ){
                /* if optind is < argc, it means there is at least one more arg
                 * beyond the args for this program */
                path_to_app = argv[optind];
                if(verbose_print) {
                        printf("app to launch: %s @ %s:%d\n",
                               path_to_app, __FILE__, __LINE__);
                }
        }
        else{
                printf("program_to_spawn option was not provded\n");
                print_usage(argv[0]);
                exit(1);
        }

        if(verbose_print){
                printf("master process will spawn %d instances; app to run: %s\n\n",
                       number_of_clients, path_to_app);
                printf("pmix version: %s (host: %s)\n", PMIx_Get_version(), hostn);
        }
        /* init pmix */
        retval = PMIx_Init(&main_proc, NULL, 0);
        if(retval != PMIX_SUCCESS){
                error_helper(retval, hostn, "error initializing pmix");
                /* a failed init must not be reported as success */
                exit(1);
        }

        if(verbose_print){
                printf("rank %d, host '%s', nspace: '%s' init'd pmix succesfully\n\n",
                       main_proc.rank, hostn, main_proc.nspace);
        }



        /* we need to attach to a "system" PMIx server so we
         * can ask it to spawn applications for us. There can
         * only be one such connection on a node, so we will
         * instruct the tool library to only look for it */
        int ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        flag = true;
        PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_TO_SYSTEM, &flag, PMIX_BOOL);

        /* initialize the library and make the connection */
        /* NOTE(review): the directive built above is never handed to
         * PMIx_tool_init (NULL, 0 is passed) - confirm whether that is
         * intentional before changing it, as it alters how the tool
         * connects */
        if (PMIX_SUCCESS != (rc = PMIx_tool_init(&tool_proc, NULL, 0 ))) {
                fprintf(stderr, "PMIx_tool_init failed: %d\n", rc );
                exit(rc);
        }
        if (0 < ninfo) {
                PMIX_INFO_FREE(info, ninfo);
        }




        /* first call fence to sync all processes */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS)
        {
                error_helper(retval, hostn, "error fencing");
                exit(retval);
        }

        /* Process SCR env vars if needed */

        if(forward_all_scr_envs){
                parse_all_scr_envs(&scr_environ, environ);
        }

        /* finalize the env array so a NULL is in place */
        finalize_array(scr_environ);

        /* TODO: setup error handling when implemented in pmix, registering
         * for specific codes such as PMIX_ERR_JOB_TERMINATED,
         * PMIX_ERR_PROC_ABORTED and PMIX_ERR_PROC_ABORTING. Until then the
         * handler is registered without a list of status codes. */
        PMIx_Register_event_handler(NULL, 0,
                                    NULL, 0,
                                    errhandler_cb,
                                    errhandler_reg_callbk,
                                    (void *) NULL);

        /* allocate memory to hold the spawned app struct */
        PMIX_APP_CREATE(spawned_app, 1);

        /* maxprocs isn't documented very well, but it appears to control
         * how many instances of the spawned app are created */
        spawned_app->maxprocs = number_of_clients;

        /* set the app to run */
        (void)asprintf(&spawned_app->cmd, "%s", path_to_app);

        /* set argv for spawned app starting with remaining argv; this
         * borrows our own argv and is detached again before the app struct
         * is freed */
        spawned_app->argv = &argv[optind];

        /* set the environment pointer */
        spawned_app->env = scr_environ;

        /*--START: count the job level and proc level infos;
         * every increment here must be matched by one entry filled in
         * below (checked by the sanity test after the fill) */

        if(!pmix_mode){
                job_info_count++;
        }

        if(host_to_use != NULL){
                proc_info_count++;
        }

        if(verbose_print){
                printf("enabling debug feature for forwarding stdout/stderr\n");
                proc_info_count+=2;
                /* add PMIX_FWD_STDOUT and PMIX_FWD_STDERR later*/

        }

        if(experimental){
                job_info_count++;
        }
        if(node_count == 1){
                job_info_count++;
        }
        /*--END: count the infos */


        /*--START: fill in the actual job/proc level info */
        PMIX_INFO_CREATE(job_info, job_info_count);
        PMIX_INFO_CREATE(proc_info, proc_info_count);

        if(host_to_use != NULL){
                /* add info struct to the spawned app itself for the host */
                strncpy(proc_info[proc_info_index].key, PMIX_HOST, PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set host val\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), string,
                                    host_to_use );
                proc_info_index++;
        }

        if(!pmix_mode){
                strncpy(job_info[job_info_index].key, PMIX_NON_PMI,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set non pmix flag\n");
                PMIX_VAL_SET(&(job_info[job_info_index].value), flag, true);
                job_info_index++;
        }
        if(verbose_print){
                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDOUT,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stdout flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;

                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDERR,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stderr flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;
        }
        if(experimental){
                printf("attempting to perform experiment\n");
                bool local_flag = true;
                PMIX_INFO_LOAD(&job_info[job_info_index], PMIX_NOTIFY_COMPLETION, &local_flag, PMIX_BOOL);
                job_info_index++;
        }
        if(node_count == 1){
                strncpy(job_info[job_info_index].key, PMIX_PPR,
                        PMIX_MAX_KEYLEN);
                PMIX_VAL_SET(&(job_info[job_info_index].value), string,
                             "1:n");
                job_info_index++;
        }
        /*--END: fill in the actual job/proc level info */

        /* sanity check to make sure we covered all the info structs */
        if(proc_info_index != proc_info_count ){
                printf("bug: mismatch with appending proc info\n");
                exit(1);
        }
        if(job_info_index != job_info_count){
                printf("bug: mismatch with appending job info\n");
                exit(1);
        }

        /* TODO: TEST PMIX_NOTIFY_COMPLETION WHEN IT'S IMPLEMENTED IN PMIX */

        /* hand the proc level info array over to the app struct */
        spawned_app->info = proc_info;
        spawned_app->ninfo = proc_info_count;

        if(verbose_print){
                printf("proc level info count: %d\n", proc_info_count);
        }
        /* call spawn */
        retval = PMIx_Spawn(job_info, job_info_count,
                            spawned_app, 1, spawned_nsp);

        if(verbose_print) {
                printf("rank %d (host %s) just called spawn; spawned nspace: %s, retval:%d\n",
                       main_proc.rank,
                       hostn,
                       spawned_nsp,
                       retval);
        }
        if(retval != PMIX_SUCCESS){
                error_helper(retval,  hostn, "error with spawn");
                /* remember the failure; retval is reused during shutdown */
                exit_status = 1;
                goto done;
        }

        /* TODO: TEMPORARY WORKAROUND TO WAIT FOR A SPAWNED PROCESS */
        if(blocking_mode){

                sleep(fixed_sleep);

                /* wait until app completes: poll done_flag every
                 * fixed_sleep seconds and give up once the accumulated
                 * wait reaches sleep_max */
                while(!done_flag){
                        sleep(fixed_sleep);
                        temp_counter++;
                        if(temp_counter*fixed_sleep >= sleep_max) {
                                if(verbose_print) printf("broke out early\n");
                                break;
                        }
                }
                if(verbose_print){
                        if(done_flag == true) {
                                printf("done_flag was set to true!\n");
                        }
                }

        }

done:
        /* fence first */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS){
                if(verbose_print) printf("error fencing, finalize may fail ! \n");
        }
        /* finalize */

        PMIx_Deregister_event_handler(_g_errhandler_ref, NULL, NULL);

        if(verbose_print){
                fprintf(stdout,
                        "spawn master process (rank %d) (host %s) finalizing\n",
                        main_proc.rank,
                        hostn);
        }

        /* clean up pmix */

        retval = PMIx_tool_finalize();

        if(retval == PMIX_SUCCESS)
        {
                if(verbose_print){
                        printf("spawn master process %d finalize success\n\n",
                               main_proc.rank);
                }
        }
        else
        {
                printf("spawn master process %d pmix_finalize FAILURE: %d\n\n",
                       main_proc.rank,
                       retval);
        }

        retval = PMIx_Finalize(NULL, 0);
        fflush(stdout);

        /*  cleanup before returning */
        PMIX_INFO_FREE(job_info, job_info_count);
        /* argv points into this process's own argv (see above), so detach
         * it before the app struct is released */
        spawned_app->argv = NULL;
        PMIX_APP_FREE(spawned_app, 1);
        if(verbose_print) printf("%s exiting cleanly :)\n", argv[0]);
        return exit_status;

}
Пример #24
0
/*
 * Example PMIx client exercising job control and process monitoring.
 *
 * Steps, in order: initialize PMIx, register a default event handler,
 * read the universe size, tell the resource manager that this job is
 * preemptible and how it can be checkpointed, request heartbeat
 * monitoring, send one heartbeat, fence with the peers and finalize.
 *
 * Any failure after initialization jumps to the common finalize section.
 * The program always returns 0.
 */
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    pmix_info_t *info, *iptr;
    bool flag;
    mylock_t mylock;
    pmix_data_array_t *dptr;

    /* init us - note that the call to "init" includes the return of
     * any job-related info provided by the RM. */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);


    /* register our default event handler - again, this isn't strictly
     * required, but is generally good practice */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    /* wait for registration to complete */
    DEBUG_WAIT_THREAD(&mylock);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
        goto done;
    }

    /* job-related info is found in our nspace, assigned to the
     * wildcard rank as it doesn't relate to a specific rank. Setup
     * a name to retrieve such values */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;

    /* get our universe size */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    /* nprocs is unsigned, so print it with an unsigned conversion */
    fprintf(stderr, "Client %s:%d universe size %u\n", myproc.nspace, myproc.rank, (unsigned int)nprocs);

    /* inform the RM that we are preemptible, and that our checkpoint methods are
     * "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */
    PMIX_INFO_CREATE(info, 2);
    flag = true;
    PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL);
    /* can't use "load" to load a pmix_data_array_t */
    /* NOTE(review): only the key and the darray pointer are set here;
     * info[1].value.type is never assigned - confirm whether it must be
     * set to the data-array type for the value to be packed and freed
     * correctly */
    (void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN);
    PMIX_DATA_ARRAY_CREATE(info[1].value.data.darray, 2, PMIX_INFO);
    dptr = info[1].value.data.darray;
    /* rc doubles as scratch storage for the values being loaded */
    rc = SIGUSR2;
    iptr = (pmix_info_t*)dptr->array;
    PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT);
    rc = PMIX_JCTRL_CHECKPOINT;
    PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS);

    /* since this is informational and not a requested operation, the target parameter
     * doesn't mean anything and can be ignored */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&mylock))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        DEBUG_DESTRUCT_LOCK(&mylock);
        /* the request was not accepted, so the directives are still ours */
        PMIX_INFO_FREE(info, 2);
        goto done;
    }
    DEBUG_WAIT_THREAD(&mylock);
    PMIX_INFO_FREE(info, 2);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* now request that this process be monitored using heartbeats */
    PMIX_INFO_CREATE(iptr, 1);
    PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER);

    PMIX_INFO_CREATE(info, 3);
    PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING);
    n = 5;  // require a heartbeat every 5 seconds
    PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32);
    n = 2;  // two heartbeats can be missed before declaring us "stalled"
    PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32);

    /* make the request */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT,
                                                      info, 3, infocbfunc, (void*)&mylock))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        DEBUG_DESTRUCT_LOCK(&mylock);
        /* release both arrays on the failure path as well */
        PMIX_INFO_FREE(iptr, 1);
        PMIX_INFO_FREE(info, 3);
        goto done;
    }
    DEBUG_WAIT_THREAD(&mylock);
    PMIX_INFO_FREE(iptr, 1);
    PMIX_INFO_FREE(info, 3);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* send a heartbeat */
    PMIx_Heartbeat();

    /* call fence to synchronize with our peers - no need to
     * collect any info as we didn't "put" anything */
    PMIX_INFO_CREATE(info, 1);
    flag = false;
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
    rc = PMIx_Fence(&proc, 1, info, 1);
    /* the fence is blocking, so the directive can be released before the
     * result is examined - this also covers the failure path */
    PMIX_INFO_FREE(info, 1);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }


  done:
    /* finalize us */
    fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
Пример #25
0
/*
 * Build the table of business cards (bc) in a shared memory segment.
 *
 * Every participating process publishes its own business card through
 * PMIx under the key "bc", all processes fence, and then the processes on
 * a node split the work of fetching the published cards and decoding them
 * into the shared segment.
 *
 * rank, size   - this process's rank and the total number of processes
 * nodemap      - rank-to-node map, passed on to the MPIR_NODEMAP helpers
 * bc, bc_len   - this process's business card and its length in bytes
 * same_len     - non-zero if all cards have the same length; otherwise
 *                every slot is VALLEN bytes and *bc_indices is filled in
 * roots_only   - non-zero if only the node roots publish/are fetched
 * bc_table     - out: set to the shared segment (also on failure)
 * bc_indices   - out: per-rank byte offsets into the table; written only
 *                when same_len is zero
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len, int same_len,
                          int roots_only, void **bc_table, size_t ** bc_indices)
{
    int rc, mpi_errno = MPI_SUCCESS;
    int start, end, i;
    char *val = NULL, *val_p;
    int out_len, rem;
    /* Loaded as PMIX_BOOL below, so it must be a real, initialized bool;
     * the published cards are read back after the fence, hence "collect" */
    bool flag = true;
    pmix_value_t value, *pvalue;
    pmix_info_t *info;
    pmix_proc_t proc;
    int local_rank, local_leader;
    size_t my_bc_len = bc_len;

    MPIR_NODEMAP_get_local_info(rank, size, nodemap, &local_size, &local_rank, &local_leader);

    /* if business cards can be different length, use the max value length */
    if (!same_len)
        bc_len = VALLEN;
    mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    mpi_errno =
        MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank,
                             MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* a single process has nobody to exchange with: store our own card */
    if (size == 1) {
        memcpy(segment, bc, my_bc_len);
        goto single;
    }

    /* encode our business card as a string value */
    val = MPL_malloc(VALLEN, MPL_MEM_ADDRESS);
    MPIR_ERR_CHKANDJUMP(!val, mpi_errno, MPI_ERR_OTHER, "**nomem");
    memset(val, 0, VALLEN);
    val_p = val;
    rem = VALLEN;
    rc = MPL_str_add_binary_arg(&val_p, &rem, "mpi", (char *) bc, my_bc_len);
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**buscard");
    MPIR_Assert(rem >= 0);

    /* publish it (everyone, or only the local leader in roots_only mode) */
    if (!roots_only || rank == local_leader) {
        value.type = PMIX_STRING;
        value.data.string = val;
        rc = PMIx_Put(PMIX_LOCAL, "bc", &value);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_put");
        rc = PMIx_Put(PMIX_REMOTE, "bc", &value);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_put");
        rc = PMIx_Commit();
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_commit");
    }

    PMIX_INFO_CREATE(info, 1);
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
    rc = PMIx_Fence(&MPIR_Process.pmix_wcproc, 1, info, 1);
    /* release the directive before the error check so it is not leaked
     * when the fence fails */
    PMIX_INFO_FREE(info, 1);
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_fence");

    if (!roots_only) {
        /* each local process fetches an equal share of all ranks; the
         * last local process also takes the remainder */
        start = local_rank * (size / local_size);
        end = start + (size / local_size);
        if (local_rank == local_size - 1)
            end += size % local_size;
        for (i = start; i < end; i++) {
            PMIX_PROC_CONSTRUCT(&proc);
            MPL_strncpy(proc.nspace, MPIR_Process.pmix_proc.nspace, PMIX_MAX_NSLEN);
            proc.rank = i;
            rc = PMIx_Get(&proc, "bc", NULL, 0, &pvalue);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_get");
            /* decode the card that was just fetched, not our own encoded
             * card in val */
            rc = MPL_str_get_binary_arg(pvalue->data.string, "mpi", &segment[i * bc_len],
                                        bc_len, &out_len);
            /* release before the error check so the value is not leaked */
            PMIX_VALUE_RELEASE(pvalue);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**argstr_missinghost");
        }
    } else {
        int num_nodes, *node_roots;
        /* NOTE(review): node_roots is returned by the helper; whether the
         * caller owns (and must free) it is not visible here - confirm */
        MPIR_NODEMAP_get_node_roots(nodemap, size, &node_roots, &num_nodes);

        /* each local process fetches an equal share of the nodes; the
         * last local process also takes the remainder */
        start = local_rank * (num_nodes / local_size);
        end = start + (num_nodes / local_size);
        if (local_rank == local_size - 1)
            end += num_nodes % local_size;
        for (i = start; i < end; i++) {
            PMIX_PROC_CONSTRUCT(&proc);
            MPL_strncpy(proc.nspace, MPIR_Process.pmix_proc.nspace, PMIX_MAX_NSLEN);
            /* i indexes nodes here; only node roots published a card, so
             * query the root rank of node i rather than rank i */
            proc.rank = node_roots[i];
            rc = PMIx_Get(&proc, "bc", NULL, 0, &pvalue);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_get");
            rc = MPL_str_get_binary_arg(pvalue->data.string, "mpi", &segment[i * bc_len],
                                        bc_len, &out_len);
            PMIX_VALUE_RELEASE(pvalue);
            MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**argstr_missinghost");
        }
    }
    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

  single:
    /* variable-length cards: report where each rank's slot starts */
    if (!same_len) {
        indices = MPL_malloc(size * sizeof(size_t), MPL_MEM_ADDRESS);
        MPIR_ERR_CHKANDJUMP(!indices, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < size; i++)
            indices[i] = bc_len * i;
        *bc_indices = indices;
    }

  fn_exit:
    MPL_free(val);
    *bc_table = segment;

    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
Пример #26
0
/*
 * Example PMIx client that cooperates with a debugger.
 *
 * Steps, in order: initialize PMIx, register a default event handler,
 * optionally wait for a debugger release notification, read universe and
 * job size, store one internal, one local and one remote key, commit,
 * fence with data collection, read the local/remote keys back and verify
 * them, then finalize.
 *
 * Failures during setup exit with a non-zero status; failures after that
 * jump to the common finalize section and the program returns 0.
 */
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    char *tmp;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    pmix_info_t *info;
    bool flag;
    volatile int active;
    pmix_status_t dbg = PMIX_ERR_DEBUGGER_RELEASE;

    /* init us - note that the call to "init" includes the return of
     * any job-related info provided by the RM. This includes any
     * debugger flag instructing us to stop-in-init. If such a directive
     * is included, then the process will be stopped in this call until
     * the "debugger release" notification arrives */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
        /* a failed init must not be reported as success */
        exit(1);
    }
    fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);


    /* register our default event handler - again, this isn't strictly
     * required, but is generally good practice */
    active = -1;
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&active);
    /* the registration callback receives &active; poll until it changes */
    while (-1 == active) {
        sleep(1);
    }
    if (0 != active) {
        fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
        exit(active);
    }

    /* job-related info is found in our nspace, assigned to the
     * wildcard rank as it doesn't relate to a specific rank. Setup
     * a name to retrieve such values */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;

    /* check to see if we have been instructed to wait for a debugger
     * to attach to us. We won't get both a stop-in-init AND a
     * wait-for-notify directive, so we should never stop twice. This
     * directive is provided so that something like an MPI implementation
     * can do some initial setup in MPI_Init prior to pausing for the
     * debugger */
    if (PMIX_SUCCESS == (rc = PMIx_Get(&proc, PMIX_DEBUG_WAIT_FOR_NOTIFY, NULL, 0, &val))) {
        /* only the presence of the key matters; release the returned
         * value so it is not leaked */
        PMIX_VALUE_RELEASE(val);
        /* register for debugger release */
        active = -1;
        PMIx_Register_event_handler(&dbg, 1, NULL, 0,
                                    release_fn, evhandler_reg_callbk, (void*)&active);
        /* wait for registration to complete */
        while (-1 == active) {
            sleep(1);
        }
        if (0 != active) {
            fprintf(stderr, "[%s:%d] Debug handler registration failed\n", myproc.nspace, myproc.rank);
            exit(active);
        }
        /* wait for debugger release */
        while (waiting_for_debugger) {
            sleep(1);
        }
    }

    /* get our universe size */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    fprintf(stderr, "Client %s:%d universe size %u\n", myproc.nspace, myproc.rank, (unsigned int)val->data.uint32);
    /* release before val is overwritten by the next PMIx_Get */
    PMIX_VALUE_RELEASE(val);
    /* get the number of procs in our job - univ size is the total number of allocated
     * slots, not the number of procs in the job */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get job size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    fprintf(stderr, "Client %s:%d num procs %u\n", myproc.nspace, myproc.rank, (unsigned int)nprocs);

    /* put a few values */
    if (0 > asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_UINT32;
    value.data.uint32 = 1234;
    if (PMIX_SUCCESS != (rc = PMIx_Store_internal(&myproc, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Store_internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_UINT64;
    value.data.uint64 = 1234;
    if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_LOCAL, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_STRING;
    value.data.string = "1234";
    if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    /* push the data to our PMIx server */
    if (PMIX_SUCCESS != (rc = PMIx_Commit())) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Commit failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* call fence to synchronize with our peers - instruct
     * the fence operation to collect and return all "put"
     * data from our peers */
    PMIX_INFO_CREATE(info, 1);
    flag = true;
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
    rc = PMIx_Fence(&proc, 1, info, 1);
    /* release the directive before checking the result so the failure
     * path does not leak it */
    PMIX_INFO_FREE(info, 1);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* check the returned data */
    /* NOTE(review): every iteration builds the key from myproc.rank and
     * queries &myproc, so this re-reads our own values nprocs times
     * rather than each peer's - confirm whether rank n was intended */
    for (n=0; n < nprocs; n++) {
        if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank)) {
            exit(1);
        }
        if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, tmp, NULL, 0, &val))) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, myproc.rank, tmp, rc);
            free(tmp);
            goto done;
        }
        if (PMIX_UINT64 != val->type) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        if (1234 != val->data.uint64) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong value: %d\n", myproc.nspace, myproc.rank, tmp, (int)val->data.uint64);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned correct\n", myproc.nspace, myproc.rank, tmp);
        PMIX_VALUE_RELEASE(val);
        free(tmp);
        if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank)) {
            exit(1);
        }
        if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, tmp, NULL, 0, &val))) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, myproc.rank, tmp, rc);
            free(tmp);
            goto done;
        }
        if (PMIX_STRING != val->type) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        if (0 != strcmp(val->data.string, "1234")) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong value: %s\n", myproc.nspace, myproc.rank, tmp, val->data.string);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned correct\n", myproc.nspace, myproc.rank, tmp);
        PMIX_VALUE_RELEASE(val);
        free(tmp);
    }

 done:
    /* finalize us */
    fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
Пример #27
0
/*
 * Register the job's namespace and every node-local task with the PMIx
 * server library, then block until all accepted registrations complete.
 *
 * The job-level key/value pairs are first collected in a temporary list
 * and then copied into a flat pmix_info_t array that is handed to
 * PMIx_server_register_nspace(). Slot 0 of register_caddy tracks the
 * namespace registration; slot i+1 tracks local task i. Each slot's
 * "active" flag is set here before the request is issued and is polled
 * below until it reads zero (presumably cleared by _release_cb - confirm
 * against its definition).
 *
 * Returns SLURM_SUCCESS when everything was registered, SLURM_ERROR when
 * the node map could not be built or any registration call failed.
 * All memory allocated here is released on every return path.
 */
int pmixp_libpmix_job_set(void)
{
	List lresp;
	pmix_info_t *info;
	int ninfo;
	ListIterator it;
	pmix_info_t *kvp;

	int i, rc;
	int ret = SLURM_SUCCESS;
	/* number of leading register_caddy slots PMIx has accepted */
	int nissued = 0;
	struct timespec ts;
	uid_t uid = pmixp_info_jobuid();
	gid_t gid = pmixp_info_jobgid();
	_register_caddy_t *register_caddy;

	register_caddy = xmalloc(sizeof(_register_caddy_t)*(pmixp_info_tasks_loc()+1));
	pmixp_debug_hang(0);

	/* Use list to safely expand/reduce key-value pairs. */
	lresp = list_create(pmixp_xfree_xmalloced);

	_general_proc_info(lresp);

	_set_tmpdirs(lresp);

	_set_procdatas(lresp);

	_set_sizeinfo(lresp);

	if (SLURM_SUCCESS != _set_mapsinfo(lresp)) {
		list_destroy(lresp);
		/* nothing was handed to PMIx yet, so the caddy can go */
		xfree(register_caddy);
		PMIXP_ERROR("Can't build nodemap");
		return SLURM_ERROR;
	}

	_set_localinfo(lresp);

	/* flatten the list into the array form expected by PMIx */
	ninfo = list_count(lresp);
	PMIX_INFO_CREATE(info, ninfo);
	it = list_iterator_create(lresp);
	i = 0;
	while (NULL != (kvp = list_next(it))) {
		info[i] = *kvp;
		i++;
	}
	list_destroy(lresp);

	register_caddy[0].active = 1;
	rc = PMIx_server_register_nspace(pmixp_info_namespace(),
			pmixp_info_tasks_loc(), info, ninfo, _release_cb,
			&register_caddy[0]);

	if (PMIX_SUCCESS != rc) {
		PMIXP_ERROR("Cannot register namespace %s, nlocalproc=%d, "
			    "ninfo = %d", pmixp_info_namespace(),
			    pmixp_info_tasks_loc(), ninfo);
		/* request was rejected: no callback is pending for slot 0 */
		ret = SLURM_ERROR;
		goto cleanup;
	}
	nissued = 1;

	PMIXP_DEBUG("task initialization");
	for (i = 0; i < pmixp_info_tasks_loc(); i++) {
		pmix_proc_t proc;
		register_caddy[i+1].active = 1;
		strncpy(proc.nspace, pmixp_info_namespace(), PMIX_MAX_NSLEN);
		/* strncpy does not terminate when the source fills the
		 * buffer, and proc is uninitialized stack memory */
		proc.nspace[PMIX_MAX_NSLEN] = '\0';
		proc.rank = pmixp_info_taskid(i);
		rc = PMIx_server_register_client(&proc, uid, gid, NULL,
				_release_cb, &register_caddy[i + 1]);
		if (PMIX_SUCCESS != rc) {
			PMIXP_ERROR("Cannot register client %d(%d) in namespace %s",
				    pmixp_info_taskid(i), i,
				    pmixp_info_namespace());
			/* Stop issuing requests, but still wait for the ones
			 * already accepted: they hold pointers into
			 * register_caddy, so it cannot be freed yet. */
			ret = SLURM_ERROR;
			break;
		}
		nissued = i + 2;
	}

cleanup:
	/* wait for all accepted registration actions to finish */
	ts.tv_sec = 0;
	ts.tv_nsec = 100;
	while (1) {
		int pending = 0;

		for (i = 0; i < nissued; i++) {
			if (register_caddy[i].active) {
				pending = 1;
				break;
			}
		}
		if (!pending) {
			break;
		}
		nanosleep(&ts, NULL);
	}
	PMIX_INFO_FREE(info, ninfo);
	xfree(register_caddy);

	return ret;
}