示例#1
0
/*
 * Spawn the debugger daemons for the job named by appspace, then register
 * an event handler that is invoked when the daemon job terminates.
 *
 * appspace - namespace of the application being debugged
 * myrel    - release object handed back to the termination callback; its
 *            nspace field is set to a heap copy of the daemon namespace
 *
 * Returns the PMIx_Spawn error if the launch fails, otherwise the status
 * recorded in the lock by the event-handler registration callback.
 */
static pmix_status_t spawn_debugger(char *appspace, myrel_t *myrel)
{
    pmix_status_t rc;
    pmix_info_t *dinfo;
    pmix_app_t *debugger;
    size_t dninfo;
    char cwd[1024];
    char dspace[PMIX_MAX_NSLEN+1];
    mylock_t mylock;
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;

    /* setup the debugger */
    PMIX_APP_CREATE(debugger, 1);
    debugger[0].cmd = strdup("./debuggerd");
    PMIX_ARGV_APPEND(rc, debugger[0].argv, "./debuggerd");
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Debugger: unable to build daemon argv: %s\n", PMIx_Error_string(rc));
        PMIX_APP_FREE(debugger, 1);
        return rc;
    }
    /* point us to our current directory; if it cannot be determined,
     * leave cwd unset rather than copying an indeterminate buffer */
    if (NULL != getcwd(cwd, sizeof(cwd))) {
        debugger[0].cwd = strdup(cwd);
    } else {
        fprintf(stderr, "Debugger: unable to determine current directory\n");
    }
    /* provide directives so the daemons go where we want, and
     * let the RM know these are debugger daemons. There are seven
     * directives, each in its own slot - previously two of them
     * shared index 1, so the debugger-daemon flag was overwritten */
    dninfo = 7;
    PMIX_INFO_CREATE(dinfo, dninfo);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
    PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUGGER_DAEMONS, NULL, PMIX_BOOL); // these are debugger daemons
    PMIX_INFO_LOAD(&dinfo[2], PMIX_DEBUG_JOB, appspace, PMIX_STRING); // the nspace being debugged
    PMIX_INFO_LOAD(&dinfo[3], PMIX_NOTIFY_COMPLETION, NULL, PMIX_BOOL); // notify us when the debugger job completes
    PMIX_INFO_LOAD(&dinfo[4], PMIX_DEBUG_WAITING_FOR_NOTIFY, NULL, PMIX_BOOL);  // tell the daemon that the proc is waiting to be released
    PMIX_INFO_LOAD(&dinfo[5], PMIX_FWD_STDOUT, NULL, PMIX_BOOL);  // forward stdout to me
    PMIX_INFO_LOAD(&dinfo[6], PMIX_FWD_STDERR, NULL, PMIX_BOOL);  // forward stderr to me
    /* spawn the daemons */
    fprintf(stderr, "Debugger: spawning %s\n", debugger[0].cmd);
    if (PMIX_SUCCESS != (rc = PMIx_Spawn(dinfo, dninfo, debugger, 1, dspace))) {
        fprintf(stderr, "Debugger daemons failed to launch with error: %s\n", PMIx_Error_string(rc));
        PMIX_INFO_FREE(dinfo, dninfo);
        PMIX_APP_FREE(debugger, 1);
        return rc;
    }
    /* cleanup */
    PMIX_INFO_FREE(dinfo, dninfo);
    PMIX_APP_FREE(debugger, 1);

    /* register callback for when this job terminates */
    myrel->nspace = strdup(dspace);
    PMIX_INFO_CREATE(dinfo, 2);
    PMIX_INFO_LOAD(&dinfo[0], PMIX_EVENT_RETURN_OBJECT, myrel, PMIX_POINTER);
    /* only call me back when this specific job terminates */
    PMIX_INFO_LOAD(&dinfo[1], PMIX_NSPACE, dspace, PMIX_STRING);

    /* block on the lock until the registration callback reports in */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(&code, 1, dinfo, 2,
                                release_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    PMIX_INFO_FREE(dinfo, 2);

    return rc;
}
示例#2
0
文件: pmi2.c 项目: AT95/ompi
/*
 * PMI2_KVS_Fence - commit locally posted data, then run a fence that
 * requests data collection (PMIX_COLLECT_DATA), which the legacy PMI2
 * semantics require.
 *
 * Returns the PMIx status of the commit (on failure) or of the fence,
 * translated through convert_err().
 */
int PMI2_KVS_Fence(void)
{
    pmix_status_t status;
    pmix_info_t directive;
    bool collect = 1;

    PMI2_CHECK();

    pmix_output_verbose(3, pmix_globals.debug_output, "PMI2_KVS_Fence");

    status = PMIx_Commit();
    if (PMIX_SUCCESS != status) {
        return convert_err(status);
    }

    /* single controlling parameter: collect all data on completion */
    PMIX_INFO_CONSTRUCT(&directive);
    PMIX_INFO_LOAD(&directive, PMIX_COLLECT_DATA, &collect, PMIX_BOOL);

    status = PMIx_Fence(NULL, 0, &directive, 1);

    PMIX_INFO_DESTRUCT(&directive);

    return convert_err(status);
}
示例#3
0
文件: pmi2.c 项目: AT95/ompi
/*
 * PMI2_Info_GetSize - look up PMIX_LOCAL_SIZE for this process and store
 * it in *size.
 *
 * The lookup is flagged PMIX_OPTIONAL because the key is expected to be
 * available at startup. If the lookup fails, the generic PMIX_ERROR is
 * what gets translated and returned.
 */
int PMI2_Info_GetSize(int *size)
{
    pmix_status_t status = PMIX_ERROR;
    pmix_value_t *result;
    pmix_info_t opt;
    bool optional = 1;

    PMI2_CHECK();

    if (NULL == size) {
        return PMI2_ERR_INVALID_ARGS;
    }

    PMIX_INFO_CONSTRUCT(&opt);
    PMIX_INFO_LOAD(&opt, PMIX_OPTIONAL, &optional, PMIX_BOOL);

    /* status only changes from PMIX_ERROR when the value was retrieved */
    if (PMIX_SUCCESS == PMIx_Get(&myproc, PMIX_LOCAL_SIZE, &opt, 1, &result)) {
        status = convert_int(size, result);
        PMIX_VALUE_RELEASE(result);
    }

    PMIX_INFO_DESTRUCT(&opt);

    return convert_err(status);
}
示例#4
0
文件: pmi1.c 项目: ashleypittman/pmix
PMIX_EXPORT int PMI_Init(int *spawned)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t info[1];
    bool  val_optinal = 1;

    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        /* if we didn't see a PMIx server (e.g., missing envar),
         * then allow us to run as a singleton */
        if (PMIX_ERR_INVALID_NAMESPACE == rc) {
            if (NULL != spawned) {
                *spawned = 0;
            }
            pmi_singleton = true;
            (void)strncpy(myproc.nspace, "1234", PMIX_MAX_NSLEN);
            myproc.rank = 0;
            pmi_init = 1;
            return PMI_SUCCESS;
        }
        return PMI_ERR_INIT;
    }

    /* getting internal key requires special rank value */
    memcpy(&proc, &myproc, sizeof(myproc));
    proc.rank = PMIX_RANK_UNDEF;

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    if (NULL != spawned) {
        /* get the spawned flag */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_SPAWNED, info, 1, &val)) {
            rc = convert_int(spawned, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* if not found, default to not spawned */
            *spawned = 0;
        }
    }
    pmi_init = 1;

    rc = PMIX_SUCCESS;

error:
    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#5
0
文件: error.c 项目: brminich/ompi
/*
 * Dispatch an error status to the registered error handlers.
 *
 * A copy of the caller's info array is built with one extra leading entry,
 * keyed PMIX_ERROR_HANDLER_ID, that carries the index of the handler being
 * called. Every registration whose info contains a PMIX_ERROR_NAME entry
 * equal to status is invoked. If none matched, the last registration that
 * has no info at all (a general handler) is invoked instead.
 */
void pmix_errhandler_invoke(pmix_status_t status,
                            pmix_proc_t procs[], size_t nprocs,
                            pmix_info_t info[], size_t ninfo)
{
    pmix_info_t *fwd;
    pmix_error_reg_info_t *reg;
    pmix_error_reg_info_t *fallback = NULL;
    int idx;
    int fallback_idx;   /* only read when fallback is non-NULL */
    size_t k;
    bool handled = false;

    /* slot 0 is the handler id, slots 1..ninfo mirror the caller's info */
    PMIX_INFO_CREATE(fwd, ninfo+1);
    (void)strncpy(fwd[0].key, PMIX_ERROR_HANDLER_ID, PMIX_MAX_KEYLEN);
    fwd[0].value.type = PMIX_INT;
    if (NULL != info) {
        /* NOTE(review): the address of the data union is passed as the
         * source - confirm this is what the loader expects for
         * pointer-typed values such as strings */
        for (k = 0; k < ninfo; k++) {
            PMIX_INFO_LOAD(&fwd[k+1], info[k].key, &info[k].value.data, info[k].value.type);
        }
    }

    for (idx = 0; idx < pmix_globals.errregs.size; idx++) {
        reg = (pmix_error_reg_info_t*) pmix_pointer_array_get_item(&pmix_globals.errregs, idx);
        if (NULL == reg) {
            continue;
        }
        if (NULL == reg->info || 0 == reg->ninfo) {
            /* general handler - remembered in case nothing else matches */
            fallback = reg;
            fallback_idx = idx;
            continue;
        }
        fwd[0].value.data.integer = idx;
        /* look for an error-name entry matching this status */
        for (k = 0; k < reg->ninfo; k++) {
            if (0 != strcmp(reg->info[k].key, PMIX_ERROR_NAME)) {
                continue;
            }
            if (status != reg->info[k].value.data.int32) {
                continue;
            }
            reg->errhandler(status, procs, nprocs, fwd, ninfo+1);
            handled = true;
            break;
        }
    }

    /* nothing specific fired, so use the general handler if there is one */
    if (!handled && NULL != fallback) {
        fwd[0].value.data.integer = fallback_idx;
        fallback->errhandler(status, procs, nprocs, fwd, ninfo+1);
    }

    PMIX_INFO_FREE(fwd, ninfo+1);
}
示例#6
0
文件: pmi2.c 项目: thananon/ompi
/*
 * PMI2_Info_GetNodeAttr - fetch a string attribute by name.
 *
 * name     - key to look up
 * value    - caller buffer that receives the NUL-terminated string
 * valuelen - size of value in bytes; must be positive
 * found    - set to 1 when a string value was copied, 0 otherwise
 * waitfor  - not used by this implementation
 *
 * Returns PMI2_ERR_INVALID_ARG for NULL pointers or a non-positive buffer
 * length and PMI2_FAIL in singleton mode. A key that is not found is not
 * an error: *found stays 0 and success is returned. Values longer than the
 * buffer are truncated but always terminated.
 */
PMIX_EXPORT int PMI2_Info_GetNodeAttr(const char name[],
                                      char value[], int valuelen,
                                      int *found, int waitfor)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_info_t info[1];
    bool  val_optinal = 1;
    pmix_proc_t proc = myproc;
    proc.rank = PMIX_RANK_UNDEF;

    (void)waitfor;

    PMI2_CHECK();

    if ((NULL == name) || (NULL == value) || (NULL == found)) {
        return PMI2_ERR_INVALID_ARG;
    }

    /* a non-positive length would become a huge size_t inside strncpy */
    if (valuelen <= 0) {
        return PMI2_ERR_INVALID_ARG;
    }

    if (pmi2_singleton) {
        return PMI2_FAIL;
    }

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    *found = 0;
    /* TODO: does PMI2's "name" makes sense to PMIx? */
    rc = PMIx_Get(&proc, name, info, 1, &val);
    if (PMIX_SUCCESS == rc && NULL != val) {
        if (PMIX_STRING != val->type) {
            rc = PMIX_ERROR;
        } else if (NULL != val->data.string) {
            /* strncpy does not terminate on truncation, so reserve the
             * last byte and terminate explicitly */
            (void)strncpy(value, val->data.string, (size_t)valuelen - 1);
            value[valuelen - 1] = '\0';
            *found = 1;
        }
        PMIX_VALUE_RELEASE(val);
    } else if (PMIX_ERR_NOT_FOUND == rc) {
        rc = PMIX_SUCCESS;
    }

    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#7
0
文件: pmi2.c 项目: AT95/ompi
/*
 * PMI2_Info_GetJobAttr - fetch a string attribute by name.
 *
 * name     - key to look up
 * value    - caller buffer that receives the NUL-terminated string
 * valuelen - size of value in bytes; must be positive
 * found    - set to 1 when a string value was copied, 0 otherwise
 *
 * Returns PMI2_ERR_INVALID_ARG for NULL pointers or a non-positive buffer
 * length. A key that is not found is not an error: *found stays 0 and
 * success is returned. Values longer than the buffer are truncated but
 * always terminated.
 */
int PMI2_Info_GetJobAttr(const char name[], char value[], int valuelen, int *found)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t info[1];
    bool  val_optinal = 1;

    PMI2_CHECK();

    if ((NULL == name) || (NULL == value) || (NULL == found)) {
        return PMI2_ERR_INVALID_ARG;
    }

    /* a non-positive length would become a huge size_t inside strncpy */
    if (valuelen <= 0) {
        return PMI2_ERR_INVALID_ARG;
    }

    /* getting internal key requires special rank value */
    memcpy(&proc, &myproc, sizeof(myproc));
    proc.rank = PMIX_RANK_UNDEF;

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    *found = 0;
    rc = PMIx_Get(&proc, name, info, 1, &val);
    if (PMIX_SUCCESS == rc && NULL != val) {
        if (PMIX_STRING != val->type) {
            rc = PMIX_ERROR;
        } else if (NULL != val->data.string) {
            /* strncpy does not terminate on truncation, so reserve the
             * last byte and terminate explicitly */
            (void)strncpy(value, val->data.string, (size_t)valuelen - 1);
            value[valuelen - 1] = '\0';
            *found = 1;
        }
        PMIX_VALUE_RELEASE(val);
    } else if (PMIX_ERR_NOT_FOUND == rc) {
        rc = PMIX_SUCCESS;
    }

    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#8
0
文件: pmi1.c 项目: ashleypittman/pmix
PMIX_EXPORT int PMI_Get_appnum(int *appnum)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_info_t info[1];
    bool  val_optinal = 1;
    pmix_proc_t proc = myproc;
    proc.rank = PMIX_RANK_WILDCARD;

    PMI_CHECK();

    if (NULL == appnum) {
        return PMI_ERR_INVALID_ARG;
    }

    if (pmi_singleton) {
        *appnum = 0;
        return PMI_SUCCESS;
    }

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    rc = PMIx_Get(&proc, PMIX_APPNUM, info, 1, &val);
    if (PMIX_SUCCESS == rc) {
        rc = convert_int(appnum, val);
        PMIX_VALUE_RELEASE(val);
    } else if( PMIX_ERR_NOT_FOUND == rc ){
        /* this is optional value, set to 0 */
        *appnum = 0;
        rc = PMIX_SUCCESS;
    }

    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#9
0
文件: pmi1.c 项目: ashleypittman/pmix
/*
 * PMI_Barrier - fence with data collection requested.
 *
 * The fence is called with a NULL process list and a single
 * PMIX_COLLECT_DATA directive so all data is collected upon completion.
 * In singleton mode there is nothing to do and PMI_SUCCESS is returned
 * immediately.
 */
PMIX_EXPORT int PMI_Barrier(void)
{
    pmix_status_t status;
    pmix_info_t directive;
    bool collect = 1;

    PMI_CHECK();

    if (pmi_singleton) {
        return PMI_SUCCESS;
    }

    PMIX_INFO_CONSTRUCT(&directive);
    PMIX_INFO_LOAD(&directive, PMIX_COLLECT_DATA, &collect, PMIX_BOOL);

    status = PMIx_Fence(NULL, 0, &directive, 1);

    PMIX_INFO_DESTRUCT(&directive);

    return convert_err(status);
}
示例#10
0
文件: pmi1.c 项目: ashleypittman/pmix
PMIX_EXPORT int PMI_Get_clique_size(int *size)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_info_t info[1];
    bool  val_optinal = 1;
    pmix_proc_t proc = myproc;
    proc.rank = PMIX_RANK_WILDCARD;

    PMI_CHECK();

    if (NULL == size) {
        return PMI_ERR_INVALID_ARG;
    }

    if (pmi_singleton) {
        *size = 1;
        return PMI_SUCCESS;
    }

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    rc = PMIx_Get(&proc, PMIX_LOCAL_SIZE, info, 1, &val);
    if (PMIX_SUCCESS == rc) {
        rc = convert_int(size, val);
        PMIX_VALUE_RELEASE(val);
    }

    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#11
0
/*
 * Simple PMIx client test program.
 *
 * Initializes as a "TEST" programming model, registers a model-declared
 * handler and a default error handler, then repeatedly puts local- and
 * remote-scope keys, commits, fences, and reads back every peer's keys to
 * check them. Finally it fetches its own modex blob, logs a message, and
 * (with "-abort" as the first argument) has rank 0 call PMIx_Abort.
 *
 * Every key string built with asprintf is released after its last use on
 * all paths, including the error/continue/break paths.
 *
 * Returns the status of PMIx_Finalize.
 */
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    char *tmp = NULL;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    int cnt, j;
    bool doabort = false;
    volatile bool active;
    pmix_info_t info, *iptr;
    size_t ninfo;
    pmix_status_t code;

    if (1 < argc) {
        if (0 == strcmp("-abort", argv[1])) {
            doabort = true;
        }
    }

    /* init us and declare we are a test programming model */
    PMIX_INFO_CREATE(iptr, 2);
    PMIX_INFO_LOAD(&iptr[0], PMIX_PROGRAMMING_MODEL, "TEST", PMIX_STRING);
    PMIX_INFO_LOAD(&iptr[1], PMIX_MODEL_LIBRARY_NAME, "PMIX", PMIX_STRING);
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, iptr, 2))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    PMIX_INFO_FREE(iptr, 2);
    pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank);

    /* test something */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    PMIX_VALUE_RELEASE(val);

    /* test something */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_SERVER_URI, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        exit(rc);
    }
    pmix_output(0, "CLIENT SERVER URI: %s", val->data.string);
    PMIX_VALUE_RELEASE(val);

    /* register a handler specifically for when models declare */
    active = true;
    ninfo = 1;
    PMIX_INFO_CREATE(iptr, ninfo);
    PMIX_INFO_LOAD(&iptr[0], PMIX_EVENT_HDLR_NAME, "SIMPCLIENT-MODEL", PMIX_STRING);
    code = PMIX_MODEL_DECLARED;
    PMIx_Register_event_handler(&code, 1, iptr, ninfo,
                                model_callback, model_registration_callback, (void*)&active);
    /* spin until the registration callback clears the flag */
    while (active) {
        usleep(10);
    }
    PMIX_INFO_FREE(iptr, ninfo);

    /* register our errhandler */
    active = true;
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, errhandler_reg_callbk, (void*)&active);
    while (active) {
        usleep(10);
    }


    /* get our universe size */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs);

    /* put a few values */
    (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank);
    value.type = PMIX_UINT32;
    value.data.uint32 = 1234;
    rc = PMIx_Store_internal(&myproc, tmp, &value);
    /* the key string is not needed once the call has returned */
    free(tmp);
    if (PMIX_SUCCESS != rc) {
        pmix_output(0, "Client ns %s rank %d: PMIx_Store_internal failed: %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
        goto done;
    }

    for (cnt=0; cnt < MAXCNT; cnt++) {
        (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, myproc.rank, cnt);
        value.type = PMIX_UINT64;
        value.data.uint64 = 1234;
        rc = PMIx_Put(PMIX_LOCAL, tmp, &value);
        free(tmp);
        if (PMIX_SUCCESS != rc) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s",
                        myproc.nspace, myproc.rank, PMIx_Error_string(rc));
            goto done;
        }

        (void)asprintf(&tmp, "%s-%d-remote-%d", myproc.nspace, myproc.rank, cnt);
        value.type = PMIX_STRING;
        value.data.string = "1234";
        rc = PMIx_Put(PMIX_REMOTE, tmp, &value);
        free(tmp);
        if (PMIX_SUCCESS != rc) {
            pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s",
                        myproc.nspace, myproc.rank, PMIx_Error_string(rc));
            goto done;
        }

        if (PMIX_SUCCESS != (rc = PMIx_Commit())) {
            pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Commit failed: %s",
                        myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc));
            goto done;
        }

        /* call fence to ensure the data is received */
        PMIX_PROC_CONSTRUCT(&proc);
        (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
        proc.rank = PMIX_RANK_WILDCARD;
        if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) {
            pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Fence failed: %s",
                        myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc));
            goto done;
        }

        /* check the returned data */
        (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
        for (j=0; j <= cnt; j++) {
            for (n=0; n < nprocs; n++) {
                proc.rank = n;
                (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, n, j);
                if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s failed: %s",
                                myproc.nspace, myproc.rank, j, tmp, PMIx_Error_string(rc));
                    free(tmp);
                    continue;
                }
                if (NULL == val) {
                    pmix_output(0, "Client ns %s rank %d: NULL value returned",
                                myproc.nspace, myproc.rank);
                    free(tmp);
                    break;
                }
                if (PMIX_UINT64 != val->type) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong type: %d", myproc.nspace, myproc.rank, j, tmp, val->type);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                    continue;
                }
                if (1234 != val->data.uint64) {
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong value: %d", myproc.nspace, myproc.rank, j, tmp, (int)val->data.uint64);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                    continue;
                }
                pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
                PMIX_VALUE_RELEASE(val);
                free(tmp);

                if (n != myproc.rank) {
                    (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j);
                    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) {
                        /* this data should _not_ be found as we are on the same node
                         * and the data was "put" with a PMIX_REMOTE scope */
                        pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp);
                        free(tmp);
                        continue;
                    }
                    pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc",
                                myproc.nspace, myproc.rank, j, tmp);
                    PMIX_VALUE_RELEASE(val);
                    free(tmp);
                }
            }
        }
    }

    /* now get the data blob for myself */
    pmix_output(0, "Client ns %s rank %d testing internal modex blob",
                myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS == (rc = PMIx_Get(&myproc, NULL, NULL, 0, &val))) {
        if (PMIX_DATA_ARRAY != val->type) {
            pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob",
                        myproc.nspace, myproc.rank);
            PMIX_VALUE_RELEASE(val);
        } else if (PMIX_INFO != val->data.darray->type) {
            pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO",
                        myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type));
            PMIX_VALUE_RELEASE(val);
        } else if (0 == val->data.darray->size) {
            pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length",
                        myproc.nspace, myproc.rank);
            PMIX_VALUE_RELEASE(val);
        } else {
            /* distinct name so the outer iptr is not shadowed */
            pmix_info_t *entries = (pmix_info_t*)val->data.darray->array;
            for (n=0; n < val->data.darray->size; n++) {
                pmix_output(0, "\tKey: %s", entries[n].key);
            }
            PMIX_VALUE_RELEASE(val);
        }
    } else {
        pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc);
    }

    /* log something */
    PMIX_INFO_CONSTRUCT(&info);
    PMIX_INFO_LOAD(&info, PMIX_LOG_STDERR, "test log msg", PMIX_STRING);
    active = true;
    rc = PMIx_Log_nb(&info, 1, NULL, 0, opcbfunc, (void*)&active);
    if (PMIX_SUCCESS != rc) {
        pmix_output(0, "Client ns %s rank %d - log_nb returned %s",
                    myproc.nspace, myproc.rank, PMIx_Error_string(rc));
    } else {
        /* only wait when the request was accepted; otherwise the
         * callback is not expected to clear the flag */
        while (active) {
            usleep(10);
        }
    }
    PMIX_INFO_DESTRUCT(&info);

    /* if requested and our rank is 0, call abort */
    if (doabort) {
        if (0 == myproc.rank) {
            PMIx_Abort(PMIX_ERR_PROC_REQUESTED_ABORT, "CALLING ABORT", NULL, 0);
        } else {
            while(!completed) {
                usleep(10);
            }
        }
    }

 done:
    /* finalize us */
    pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %s\n",
                myproc.nspace, myproc.rank, PMIx_Error_string(rc));
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(rc);
}
示例#12
0
文件: pmi2.c 项目: AT95/ompi
/*
 * PMI2_Init - initialize the PMIx client and report basic job information.
 *
 * spawned - optional output; PMIX_SPAWNED value, 0 if the key is missing
 * size    - optional output; PMIX_UNIV_SIZE value (failure to retrieve it
 *           is fatal when requested)
 * rank    - optional output; our rank as reported by PMIx_Init
 * appnum  - optional output; PMIX_APPNUM value, 0 if the key is missing
 *
 * Any output pointer may be NULL, in which case that item is skipped.
 * Returns PMI2_ERR_INIT when PMIx_Init fails, otherwise the translated
 * PMIx status. pmi2_init is only set when everything succeeded.
 */
int PMI2_Init(int *spawned, int *size, int *rank, int *appnum)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t info[1];
    bool  val_optinal = 1;

    if (PMIX_SUCCESS != PMIx_Init(&myproc)) {
        return PMI2_ERR_INIT;
    }

    /* get the rank - guarded like the other optional outputs so a
     * NULL pointer is not dereferenced */
    if (NULL != rank) {
        *rank = myproc.rank;
    }

    /* getting internal key requires special rank value */
    memcpy(&proc, &myproc, sizeof(myproc));
    proc.rank = PMIX_RANK_UNDEF;

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    if (NULL != size) {
        /* get the universe size - this will likely pull
         * down all attributes assigned to the job, thus
         * making all subsequent "get" operations purely
         * local */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_UNIV_SIZE, info, 1, &val)) {
            rc = convert_int(size, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* cannot continue without this info */
            rc = PMIX_ERR_INIT;
            goto error;
        }
    }

    if (NULL != spawned) {
        /* get the spawned flag */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_SPAWNED, info, 1, &val)) {
            rc = convert_int(spawned, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* if not found, default to not spawned */
            *spawned = 0;
        }
    }

    if (NULL != appnum) {
        /* get our appnum */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_APPNUM, info, 1, &val)) {
            rc = convert_int(appnum, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* if not found, default to 0 */
            *appnum = 0;
        }
    }
    pmi2_init = 1;

    rc = PMIX_SUCCESS;

error:
    /* shared exit: release the directive on both success and failure */
    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#13
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_proc_t myproc;
    pmix_info_t *info;
    pmix_app_t *app;
    size_t ninfo, napps;

    /* check for user directives - this would include:
     * - a flag indicating we want to attach to a specified application
     * - application info if we are launching a new app
     */

    /* init us - if a PMIx server pid was provided, then pass it along */
    if (0 < server_pid) {
        ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        PMIX_INFO_LOAD(&info[0], PMIX_SERVER_PIDINFO, server_pid, PMIX_UINT32);
    } else {
        info = NULL;
        ninfo = 0;
    }
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) {
        fprintf(stderr, "PMIx_tool_init failed: %d\n", rc);
        exit(rc);
    }
    if (0 < ninfo) {
        PMIX_INFO_FREE(info, ninfo);
    }
    fprintf(stderr, "Tool ns %s rank %d: Running\n", myproc.nspace, myproc.rank);

    /* if we are attaching to a running job, then attach to it */
    if (attach) {
        ret = attach_to_running_job(argv[1]);
    } else {
        /* this is an initial launch - we need to launch the application
         * plus the debugger daemons, letting the RM know we are debugging
         * so that it will "pause" the app procs until we are ready */
        napps = 2;
        PMIX_APP_CREATE(app, napps);
        /* setup the executable */
        app[0].cmd = strdup("app");
        app[0].argc = 1;
        app[0].argv = (char**)malloc(2*sizeof(char*));
        app[0].argv[0] = strdup("app");
        app[0].argv[1] = NULL;
        /* provide directives so the apps do what the user requested */
        ninfo = 2;
        PMIX_INFO_CREATE(app[0].info, ninfo);
        PMIX_INFO_LOAD(&app[0].info[0], PMIX_NP, 128, PMIX_UINT64);
        PMIX_INFO_LOAD(&app[0].info[0], PMIX_MAPBY, "slot", PMIX_STRING);

        /* setup the name of the daemon executable to launch */
        app[1].cmd = strdup("debuggerdaemon");
        app[1].argc = 1;
        app[1].argv = (char**)malloc(2*sizeof(char*));
        app[1].argv[0] = strdup("debuggerdaemon");
        app[1].argv[1] = NULL;
        /* provide directives so the daemons go where we want, and
         * let the RM know these are debugger daemons */
        ninfo = 2;
        PMIX_INFO_CREATE(app[1].info, ninfo);
        PMIX_INFO_LOAD(&app[1].info[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING);  // instruct the RM to launch one copy of the executable on each node
        PMIX_INFO_LOAD(&app[1].info[1], PMIX_DEBUGGER_DAEMONS, true, PMIX_BOOL); // these are debugger daemons
        /* spawn the daemons */
        PMIx_Spawn(NULL, 0, app, napps, dspace);
        /* cleanup */
        PMIX_APP_FREE(app, napps);

        /* this is where a debugger tool would wait until the debug operation is complete */
    }


 done:
    PMIx_tool_finalize(NULL, 0);

    return(ret);
}
示例#14
0
文件: scr_pmix_spawn.c 项目: LLNL/scr
/*
 * Entry point of the SCR PMIx spawn helper.
 *
 * Parses the command line, initializes PMIx (both as a client and as a
 * tool), spawns the requested application through PMIx_Spawn and, in
 * blocking mode, polls done_flag until it is set or the sleep budget
 * is used up.
 *
 * Options (see optstring):
 *   -h        print usage and exit
 *   -n count  number of instances to spawn, must be within [1-100]
 *   -N arg    request one proc per node (the argument value is ignored)
 *   -B secs   blocking mode with a maximum wait of `secs` seconds
 *   -b        non-blocking mode
 *   -L hosts  host list handed to the spawned app via PMIX_HOST
 *   -x name   forward an env var; a name equal to SCR_STRING forwards
 *             all SCR related env vars
 *   -v        verbose output (only affects options parsed after it)
 *   -p / -P   turn pmix mode on / off
 *   -e        enable the experimental PMIX_NOTIFY_COMPLETION directive
 *
 * The first non-option argument is the program to spawn; it and all
 * following arguments become the argv of the spawned app.
 *
 * Returns 0 on success and 1 when PMIx_Spawn failed. Usage errors and
 * initialization failures terminate the process through exit() with a
 * non-zero status.
 */
int main(int argc, char **argv, const char **environ)
{

        pmix_status_t rc;

        pmix_info_t *info = NULL;

        bool flag;

        pmix_status_t retval;
        pmix_app_t *spawned_app = NULL;

        pmix_info_t *job_info = NULL;
        pmix_info_t *proc_info = NULL;
        int job_info_count = 0;
        int job_info_index = 0;
        int proc_info_count = 0;
        int proc_info_index = 0;

        /* zero-filled so it can be printed even if the spawn never
         * wrote a namespace into it */
        char spawned_nsp[PMIX_MAX_NSLEN+1] = {0};
        char *path_to_app = NULL;
        char *host_to_use = NULL;
        int number_of_clients = 0;
        int temp_counter = 0;
        /* status returned from main; becomes non-zero if the spawn fails */
        int exit_status = 0;
        done_flag = false;
        gethostname(hostn, 500);

        char **scr_environ = NULL;

        int proc_count = 1;
        int node_count = 0;
        bool blocking_mode = true;
        char *node_list = NULL;
        bool forward_all_scr_envs = false;
        bool pmix_mode = false;


        const char *optstring = "+n:N:L:x:bB:pPvhe";
        /* todo: add arg parsing with ompi schizo */
        verbose_print = false;

        int sleep_max = 30;
        const int fixed_sleep = 5;
        int c;
        while((c = getopt(argc, argv, optstring)) != -1){
                switch(c){
                case 'h':
                        print_usage(argv[0]);
                        exit(0);
                        break;
                case 'n':
                        proc_count = atoi(optarg);
                        if(proc_count <= 0 || proc_count > 100){
                                printf("outside the range of allowable instances to spawn [1-100]\n");
                                exit(1);
                        }
                        if(verbose_print) {
                                printf("proc_count = %d\n", proc_count);
                        }
                        break;
                case 'N':
                        /* the argument is currently ignored: only "one per
                         * node" is supported */
                        /* node_count = atoi(optarg); */
                        node_count = 1;
                        if(verbose_print) {
                                printf("node_count = %d\n", node_count);
                        }
                        break;
                case 'B':
                        blocking_mode = true;
                        sleep_max = atoi(optarg);
                        if(sleep_max < 0){
                                printf("can't sleep for less than 0 seconds\n");
                                exit(1);
                        }
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'b':
                        blocking_mode = false;
                        if(verbose_print){
                                printf("blocking mode = %x\n", blocking_mode);
                        }
                        break;
                case 'L':
                        node_list = optarg;
                        host_to_use = node_list;
                        if(verbose_print){
                                printf("node_list = '%s'\n", node_list);
                        }
                        break;
                case 'x':
                        /* a name equal to SCR_STRING requests forwarding of
                         * all SCR related env vars; anything else is treated
                         * as a single env var to forward */
                        if(strcmp(optarg, SCR_STRING) == 0){
                                if(verbose_print) printf("all scr envs will be forwarded\n");
                                forward_all_scr_envs = true;
                        }
                        else{
                                /* handled like a normal env var */
                                handle_standard_env_var(optarg, &scr_environ);
                        }
                        break;
                case 'v':
                        verbose_print = true;
                        break;
                case 'p':
                        pmix_mode = true;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'P':
                        pmix_mode = false;
                        if(verbose_print){
                                printf("pmix_mode = %x\n", pmix_mode);
                        }
                        break;
                case 'e':
                        experimental = true;
                        break;
                case '?':
                        printf("missing a required argument or invalid option: %x\n", optopt);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                default:
                        printf("Unrecognized argument: %c\n", c);
                        print_usage(argv[0]);
                        exit(1);
                        break;
                }
        }

        /* number of instances to spawn */
        number_of_clients = proc_count;


        /* check to make sure an application was specified to launch */
        if( optind < argc ){
                /* if optind is < argc, it means there is at least one more arg
                 * beyond the args for this program */
                path_to_app = argv[optind];
                if(verbose_print) {
                        printf("app to launch: %s @ %s:%d\n",
                               path_to_app, __FILE__, __LINE__);
                }
        }
        else{
                printf("program_to_spawn option was not provded\n");
                print_usage(argv[0]);
                exit(1);
        }

        if(verbose_print){
                printf("master process will spawn %d instances; app to run: %s\n\n",
                       number_of_clients, path_to_app);
                printf("pmix version: %s (host: %s)\n", PMIx_Get_version(), hostn);
        }
        /* init pmix */
        retval = PMIx_Init(&main_proc, NULL, 0);
        if(retval != PMIX_SUCCESS){
                error_helper(retval, hostn, "error initializing pmix");
                /* an initialization failure must not look like success */
                exit(1);
        }

        if(verbose_print){
                printf("rank %d, host '%s', nspace: '%s' init'd pmix succesfully\n\n",
                       main_proc.rank, hostn, main_proc.nspace);
        }



        /* we need to attach to a "system" PMIx server so we
         * can ask it to spawn applications for us. There can
         * only be one such connection on a node, so we will
         * instruct the tool library to only look for it */
        int ninfo = 1;
        PMIX_INFO_CREATE(info, ninfo);
        flag = true;
        PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_TO_SYSTEM, &flag, PMIX_BOOL);

        /* initialize the library and make the connection */
        /* NOTE(review): the directive built above is NOT handed to
         * PMIx_tool_init (NULL, 0 is passed) - confirm whether that is
         * intentional before wiring it in */
        if (PMIX_SUCCESS != (rc = PMIx_tool_init(&tool_proc, NULL, 0 ))) {
                fprintf(stderr, "PMIx_tool_init failed: %d\n", rc );
                exit(rc);
        }
        if (0 < ninfo) {
                PMIX_INFO_FREE(info, ninfo);
        }




        /* first call fence to sync all processes */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS)
        {
                error_helper(retval, hostn, "error fencing");
                exit(retval);
        }

        /* Process SCR env vars if needed */

        if(forward_all_scr_envs){
                parse_all_scr_envs(&scr_environ, environ);
        }

        /* finalize the env array so a NULL is in place */
        finalize_array(scr_environ);

        /* TODO: setup error handling when implemented in pmix, registering
         * for specific codes (PMIX_ERR_JOB_TERMINATED, PMIX_ERR_PROC_ABORTED,
         * PMIX_ERR_PROC_ABORTING) and passing error-group info structs.
         * For now a default handler without codes or directives is used. */
        PMIx_Register_event_handler(NULL, 0,
                                    NULL, 0,
                                    errhandler_cb,
                                    errhandler_reg_callbk,
                                    (void *) NULL);

        /* allocate memory to hold the spawend app struct */
        PMIX_APP_CREATE(spawned_app, 1);

        /* maxprocs isn't documented very well, but it appears to control
         * how many instances of the spanwed app are created */
        spawned_app->maxprocs = number_of_clients;

        /* set the app to run */
        (void)asprintf(&spawned_app->cmd, "%s", path_to_app);

        /* set argv for spawned app starting with remaining argv; this
         * memory belongs to our own argv, so it is detached again before
         * the app struct is freed */
        spawned_app->argv = &argv[optind];

        /* set the environment pointer */
        spawned_app->env = scr_environ;

        /*--START: count the job level and proc level infos */
        /* every condition below must be mirrored in the "append" section
         * further down; the sanity check afterwards catches mismatches */
        if(!pmix_mode){
                job_info_count++;
        }

        if(host_to_use != NULL){
                proc_info_count++;
        }

        if(verbose_print){
                printf("enabling debug feature for forwarding stdout/stderr\n");
                proc_info_count+=2;
                /* add PMIX_FWD_STDOUT and PMIX_FWD_STDERR later*/

        }

        if(experimental){
                job_info_count++;
        }
        if(node_count == 1){
                job_info_count++;
        }
        /*--END: count the job level and proc level infos */


        /*--START: append actual job/proc level info */
        PMIX_INFO_CREATE(job_info, job_info_count);
        PMIX_INFO_CREATE(proc_info, proc_info_count);

        if(host_to_use != NULL){
                /* add info struct to the spawned app itself for the host */
                strncpy(proc_info[proc_info_index].key, PMIX_HOST, PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set host val\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), string,
                                    host_to_use );
                proc_info_index++;
        }

        if(!pmix_mode){
                strncpy(job_info[job_info_index].key, PMIX_NON_PMI,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set non pmix flag\n");
                PMIX_VAL_SET(&(job_info[job_info_index].value), flag, true);
                job_info_index++;
        }
        if(verbose_print){
                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDOUT,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stdout flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;

                strncpy(proc_info[proc_info_index].key, PMIX_FWD_STDERR,
                        PMIX_MAX_KEYLEN);
                if(verbose_print) printf("about to set stderr flag\n");
                PMIX_VAL_SET(&(proc_info[proc_info_index].value), flag, true );
                proc_info_index++;
        }
        if(experimental){
                printf("attempting to perform experiment\n");
                bool local_flag = true;
                PMIX_INFO_LOAD(&job_info[job_info_index], PMIX_NOTIFY_COMPLETION, &local_flag, PMIX_BOOL);
                job_info_index++;
        }
        if(node_count == 1){
                strncpy(job_info[job_info_index].key, PMIX_PPR,
                        PMIX_MAX_KEYLEN);
                PMIX_VAL_SET(&(job_info[job_info_index].value), string,
                             "1:n");
                job_info_index++;
        }
        /*--END: append actual job/proc level info */

        /* sanity check to make sure we covered all the info structs */
        if(proc_info_index != proc_info_count ){
                printf("bug: mismatch with appending proc info\n");
                exit(1);
        }
        if(job_info_index != job_info_count){
                printf("bug: mismatch with appending job info\n");
                exit(1);
        }

        /* TODO: TEST PMIX_NOTIFY_COMPLETION WHEN IT'S IMPLEMENTED IN PMIX
         * (other job level candidates: PMIX_TIMEOUT, PMIX_DISPLAY_MAP) */

        /* hand the proc level info over to the app struct */
        spawned_app->info = proc_info;
        spawned_app->ninfo = proc_info_count;

        if(verbose_print){
                printf("proc level info count: %d\n", proc_info_count);
        }
        /* call spawn */
        retval = PMIx_Spawn(job_info, job_info_count,
                            spawned_app, 1, spawned_nsp);

        if(verbose_print) {
                printf("rank %d (host %s) just called spawn; spawned nspace: %s, retval:%d\n",
                       main_proc.rank,
                       hostn,
                       spawned_nsp,
                       retval);
        }
        if(retval != PMIX_SUCCESS){
                error_helper(retval,  hostn, "error with spawn");
                /* remember the failure: retval is reused during cleanup */
                exit_status = 1;
                goto done;
        }

        /* TODO: TEMPORARY WORKAROUND TO WAIT FOR A SPAWNED PROCESS */
        if(blocking_mode){

                sleep(fixed_sleep);

                /* wait until app completes: done_flag is polled every
                 * fixed_sleep seconds until sleep_max seconds have passed */
                while(!done_flag){
                        sleep(fixed_sleep);
                        temp_counter++;
                        if(temp_counter*fixed_sleep >= sleep_max) {
                                if(verbose_print) printf("broke out early\n");
                                break;
                        }
                }
                if(verbose_print){
                        if(done_flag == true) {
                                printf("done_flag was set to true!\n");
                        }
                }

        }

done:
        /* fence first */
        retval = fence_helper();
        if(retval != PMIX_SUCCESS){
                if(verbose_print) printf("error fencing, finalize may fail ! \n");
        }
        /* finalize */

        PMIx_Deregister_event_handler(_g_errhandler_ref, NULL, NULL);

        if(verbose_print){
                fprintf(stdout,
                        "spawn master process (rank %d) (host %s) finalizing\n",
                        main_proc.rank,
                        hostn);
        }

        /* clean up pmix */

        retval = PMIx_tool_finalize();

        if(retval == PMIX_SUCCESS)
        {
                if(verbose_print){
                        printf("spawn master process %d finalize success\n\n",
                               main_proc.rank);
                }
        }
        else
        {
                printf("spawn master process %d pmix_finalize FAILURE: %d\n\n",
                       main_proc.rank,
                       retval);
        }

        retval = PMIx_Finalize(NULL, 0);
        fflush(stdout);

        /*  cleanup before returning */
        PMIX_INFO_FREE(job_info, job_info_count);
        /* argv points into our own argv and must not be freed with the app */
        spawned_app->argv = NULL;
        PMIX_APP_FREE(spawned_app, 1);
        if(verbose_print) printf("%s exiting cleanly :)\n", argv[0]);
        return exit_status;

}
示例#15
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t *info;
    size_t ninfo;
    pmix_query_t *query;
    size_t nq, n;
    myquery_data_t myquery_data;
    pid_t pid;
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;
    mylock_t mylock;
    myrel_t myrel;
    uint16_t localrank;
    char *target = NULL;

    pid = getpid();

    /* init us - since we were launched by the RM, our connection info
     * will have been provided at startup. */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
        fprintf(stderr, "Debugger daemon: PMIx_tool_init failed: %d\n", rc);
        exit(0);
    }
    fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid);


    /* register our default event handler */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    if (PMIX_SUCCESS != mylock.status) {
        rc = mylock.status;
        DEBUG_DESTRUCT_LOCK(&mylock);
        goto done;
    }
    DEBUG_DESTRUCT_LOCK(&mylock);

    /* get the nspace of the job we are to debug - it will be in our JOB info */
#ifdef PMIX_LOAD_PROCID
    PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD);
#else
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_KEYLEN);
    proc.rank = PMIX_RANK_WILDCARD;
#endif
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - error %s\n",
                myproc.nspace, myproc.rank,
                (unsigned long)pid, PMIx_Error_string(rc));
        goto done;
    }
    if (NULL == val || PMIX_STRING != val->type || NULL == val->data.string) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - NULL data returned\n",
                myproc.nspace, myproc.rank, (unsigned long)pid);
        goto done;
    }
    /* save it for later */
    target = strdup(val->data.string);
    PMIX_VALUE_RELEASE(val);

    fprintf(stderr, "[%s:%d:%lu] Debugging %s\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, target);

    /* get my local rank so I can determine which local proc is "mine"
     * to debug */
    val = NULL;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_LOCAL_RANK, NULL, 0, &val))) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - error %s\n",
                myproc.nspace, myproc.rank,
                (unsigned long)pid, PMIx_Error_string(rc));
        goto done;
    }
    if (NULL == val) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - NULL data returned\n",
                myproc.nspace, myproc.rank, (unsigned long)pid);
        goto done;
    }
    if (PMIX_UINT16 != val->type) {
        fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - returned wrong type %s\n",
                myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Data_type_string(val->type));
        goto done;
    }
    /* save the data */
    localrank = val->data.uint16;
    PMIX_VALUE_RELEASE(val);
    fprintf(stderr, "[%s:%d:%lu] my local rank %d\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, (int)localrank);

    /* register another handler specifically for when the target
     * job completes */
    DEBUG_CONSTRUCT_LOCK(&myrel.lock);
    myrel.nspace = strdup(proc.nspace);
    PMIX_INFO_CREATE(info, 2);
    PMIX_INFO_LOAD(&info[0], PMIX_EVENT_RETURN_OBJECT, &myrel, PMIX_POINTER);
    /* only call me back when this specific job terminates */
    PMIX_LOAD_PROCID(&proc, target, PMIX_RANK_WILDCARD);
    PMIX_INFO_LOAD(&info[1], PMIX_EVENT_AFFECTED_PROC, &proc, PMIX_PROC);

    fprintf(stderr, "[%s:%d:%lu] registering for termination of %s\n", myproc.nspace, myproc.rank,
            (unsigned long)pid, proc.nspace);


    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(&code, 1, info, 2,
                                release_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    if (PMIX_SUCCESS != mylock.status) {
        rc = mylock.status;
        DEBUG_DESTRUCT_LOCK(&mylock);
        PMIX_INFO_FREE(info, 2);
        goto done;
    }
    DEBUG_DESTRUCT_LOCK(&mylock);
    PMIX_INFO_FREE(info, 2);

    /* get our local proctable - for scalability reasons, we don't want to
     * have our "root" debugger process get the proctable for everybody and
     * send it out to us. So ask the local PMIx server for the pid's of
     * our local target processes */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE);
    query[0].nqual = 1;
    PMIX_INFO_CREATE(query[0].qualifiers, 1);
    PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, target, PMIX_STRING);  // the nspace we are enquiring about
    /* setup the caddy to retrieve the data */
    DEBUG_CONSTRUCT_LOCK(&myquery_data.lock);
    myquery_data.info = NULL;
    myquery_data.ninfo = 0;
    /* execute the query */
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
        fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
        goto done;
    }
    DEBUG_WAIT_THREAD(&myquery_data.lock);
    DEBUG_DESTRUCT_LOCK(&myquery_data.lock);
    PMIX_QUERY_FREE(query, nq);
    if (PMIX_SUCCESS != myquery_data.status) {
        rc = myquery_data.status;
        goto done;
    }

    fprintf(stderr, "[%s:%d:%lu] Local proctable received\n", myproc.nspace, myproc.rank, (unsigned long)pid);


    /* now that we have the proctable for our local processes, we can do our
     * magic debugger stuff and attach to them. We then send a "release" event
     * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We
     * do this with the event notification system. For this example, we just
     * send it to all local procs of the job being debugged */
    (void)strncpy(proc.nspace, target, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    ninfo = 2;
    PMIX_INFO_CREATE(info, ninfo);
    PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC);  // deliver to the target nspace
    PMIX_INFO_LOAD(&info[1], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL);  // deliver to the target nspace
    fprintf(stderr, "[%s:%u:%lu] Sending release\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    rc = PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
                           NULL, PMIX_RANGE_CUSTOM,
                           info, ninfo, NULL, NULL);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "%s[%s:%u:%lu] Sending release failed with error %s(%d)\n",
                myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Error_string(rc), rc);
        goto done;
    }

    /* do some debugger magic while waiting for the job to terminate */
    DEBUG_WAIT_THREAD(&myrel.lock);

  done:
    if (NULL != target) {
        free(target);
    }
    /* finalize us */
    fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Finalizing\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank, (unsigned long)pid);
    }
    fflush(stderr);
    return(0);
}
示例#16
0
文件: client.c 项目: jsquyres/ompi
/*
 * Entry point of the example PMIx client.
 *
 * Sequence of operations:
 *   1. PMIx_Init and registration of a default event handler
 *   2. if PMIX_DEBUG_WAIT_FOR_NOTIFY is present in the job info, register
 *      for PMIX_ERR_DEBUGGER_RELEASE and wait until waiting_for_debugger
 *      is cleared
 *   3. read the universe size and the job size
 *   4. store one internal, one PMIX_LOCAL and one PMIX_REMOTE value,
 *      commit them and fence with data collection
 *   5. read back this process' own local/remote values and verify them
 *   6. finalize
 *
 * argc/argv are not used. Returns 0 after finalizing; asprintf or
 * handler-registration failures terminate through exit().
 */
int main(int argc, char **argv)
{
    int rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    char *tmp;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    pmix_info_t *info;
    bool flag;
    volatile int active;
    pmix_status_t dbg = PMIX_ERR_DEBUGGER_RELEASE;

    /* init us - note that the call to "init" includes the return of
     * any job-related info provided by the RM. This includes any
     * debugger flag instructing us to stop-in-init. If such a directive
     * is included, then the process will be stopped in this call until
     * the "debugger release" notification arrives */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);


    /* register our default event handler - again, this isn't strictly
     * required, but is generally good practice */
    active = -1;
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&active);
    /* -1 means "registration still pending" */
    while (-1 == active) {
        sleep(1);
    }
    if (0 != active) {
        fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
        exit(active);
    }

    /* job-related info is found in our nspace, assigned to the
     * wildcard rank as it doesn't relate to a specific rank. Setup
     * a name to retrieve such values */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;

    /* check to see if we have been instructed to wait for a debugger
     * to attach to us. We won't get both a stop-in-init AND a
     * wait-for-notify directive, so we should never stop twice. This
     * directive is provided so that something like an MPI implementation
     * can do some initial setup in MPI_Init prior to pausing for the
     * debugger */
    if (PMIX_SUCCESS == (rc = PMIx_Get(&proc, PMIX_DEBUG_WAIT_FOR_NOTIFY, NULL, 0, &val))) {
        /* only the presence of the key matters - release the returned
         * value so it is not leaked */
        PMIX_VALUE_RELEASE(val);
        /* register for debugger release */
        active = -1;
        PMIx_Register_event_handler(&dbg, 1, NULL, 0,
                                    release_fn, evhandler_reg_callbk, (void*)&active);
        /* wait for registration to complete */
        while (-1 == active) {
            sleep(1);
        }
        if (0 != active) {
            fprintf(stderr, "[%s:%d] Debug handler registration failed\n", myproc.nspace, myproc.rank);
            exit(active);
        }
        /* wait for debugger release */
        while (waiting_for_debugger) {
            sleep(1);
        }
    }

    /* get our universe size */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, val->data.uint32);
    /* done with the universe size - release it before val is reused */
    PMIX_VALUE_RELEASE(val);
    /* get the number of procs in our job - univ size is the total number of allocated
     * slots, not the number of procs in the job */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get job size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    fprintf(stderr, "Client %s:%d num procs %d\n", myproc.nspace, myproc.rank, nprocs);

    /* put a few values */
    if (0 > asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_UINT32;
    value.data.uint32 = 1234;
    if (PMIX_SUCCESS != (rc = PMIx_Store_internal(&myproc, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Store_internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_UINT64;
    value.data.uint64 = 1234;
    if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_LOCAL, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank)) {
        exit(1);
    }
    value.type = PMIX_STRING;
    value.data.string = "1234";
    if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Put internal failed: %d\n", myproc.nspace, myproc.rank, rc);
        free(tmp);
        goto done;
    }
    free(tmp);

    /* push the data to our PMIx server */
    if (PMIX_SUCCESS != (rc = PMIx_Commit())) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Commit failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* call fence to synchronize with our peers - instruct
     * the fence operation to collect and return all "put"
     * data from our peers */
    PMIX_INFO_CREATE(info, 1);
    flag = true;
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
        PMIX_INFO_FREE(info, 1);
        goto done;
    }
    PMIX_INFO_FREE(info, 1);

    /* check the returned data */
    /* NOTE(review): every iteration looks up this process' own keys
     * (myproc) rather than those of rank n - confirm whether checking
     * the peers' data was intended */
    for (n=0; n < nprocs; n++) {
        if (0 > asprintf(&tmp, "%s-%d-local", myproc.nspace, myproc.rank)) {
            exit(1);
        }
        if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, tmp, NULL, 0, &val))) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, myproc.rank, tmp, rc);
            free(tmp);
            goto done;
        }
        if (PMIX_UINT64 != val->type) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        if (1234 != val->data.uint64) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong value: %d\n", myproc.nspace, myproc.rank, tmp, (int)val->data.uint64);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned correct\n", myproc.nspace, myproc.rank, tmp);
        PMIX_VALUE_RELEASE(val);
        free(tmp);
        if (0 > asprintf(&tmp, "%s-%d-remote", myproc.nspace, myproc.rank)) {
            exit(1);
        }
        if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, tmp, NULL, 0, &val))) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s failed: %d\n", myproc.nspace, myproc.rank, tmp, rc);
            free(tmp);
            goto done;
        }
        if (PMIX_STRING != val->type) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong type: %d\n", myproc.nspace, myproc.rank, tmp, val->type);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        if (0 != strcmp(val->data.string, "1234")) {
            fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned wrong value: %s\n", myproc.nspace, myproc.rank, tmp, val->data.string);
            PMIX_VALUE_RELEASE(val);
            free(tmp);
            goto done;
        }
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get %s returned correct\n", myproc.nspace, myproc.rank, tmp);
        PMIX_VALUE_RELEASE(val);
        free(tmp);
    }

 done:
    /* finalize us */
    fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
示例#17
0
/**
 * Validate a peer's credential for the "native" security mechanism.
 *
 * For PMIX_PROTOCOL_V1 peers the uid/gid are obtained from the socket
 * itself (SO_PEERCRED or getpeereid, whichever the platform offers) and
 * the passed-in credential is ignored. For PMIX_PROTOCOL_V2 peers the
 * credential must hold a uid_t immediately followed by a gid_t. The
 * resulting ids are compared against those recorded in pr->info.
 *
 * @param peer        peer whose credential is being checked
 * @param directives  optional directives; a PMIX_CRED_TYPE entry that does
 *                    not list "native" causes PMIX_ERR_NOT_SUPPORTED
 * @param ndirs       number of entries in directives
 * @param info        if non-NULL, receives a newly created 3-element array
 *                    (credential type, uid, gid)
 * @param ninfo       receives the number of entries stored in *info
 * @param cred        credential bytes (required for PMIX_PROTOCOL_V2)
 *
 * @return PMIX_SUCCESS when the ids match, PMIX_ERR_INVALID_CRED when they
 *         do not (or the credential is malformed), PMIX_ERR_NOT_SUPPORTED
 *         for an unknown protocol or non-native request, PMIX_ERR_NOMEM if
 *         the result array cannot be created.
 */
static pmix_status_t validate_cred(struct pmix_peer_t *peer,
                                   const pmix_info_t directives[], size_t ndirs,
                                   pmix_info_t **info, size_t *ninfo,
                                   const pmix_byte_object_t *cred)
{
    pmix_peer_t *pr = (pmix_peer_t*)peer;

#if defined(SO_PEERCRED)
#ifdef HAVE_STRUCT_SOCKPEERCRED_UID
#define HAVE_STRUCT_UCRED_UID
    struct sockpeercred ucred;
#else
    struct ucred ucred;
#endif
    socklen_t crlen = sizeof (ucred);
#endif
    uid_t euid = -1;
    gid_t egid = -1;
    char *ptr;
    size_t ln;
    bool takeus;
    char **types;
    size_t n, m;
    uint32_t u32;
    pmix_info_t *results;

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "psec: native validate_cred %s",
                        (NULL == cred) ? "NULL" : "NON-NULL");

    if (PMIX_PROTOCOL_V1 == pr->protocol) {
        /* usock protocol - get the remote side's uid/gid */
#if defined(SO_PEERCRED) && (defined(HAVE_STRUCT_UCRED_UID) || defined(HAVE_STRUCT_UCRED_CR_UID))
        /* Ignore received 'cred' and validate ucred for socket instead. */
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "psec:native checking getsockopt on socket %d for peer credentials", pr->sd);
        if (getsockopt(pr->sd, SOL_SOCKET, SO_PEERCRED, &ucred, &crlen) < 0) {
            pmix_output_verbose(2, pmix_globals.debug_output,
                                "psec: getsockopt SO_PEERCRED failed: %s",
                                strerror (pmix_socket_errno));
            return PMIX_ERR_INVALID_CRED;
        }
#if defined(HAVE_STRUCT_UCRED_UID)
        euid = ucred.uid;
        egid = ucred.gid;
#else
        euid = ucred.cr_uid;
        egid = ucred.cr_gid;
#endif

#elif defined(HAVE_GETPEEREID)
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "psec:native checking getpeereid on socket %d for peer credentials", pr->sd);
        if (0 != getpeereid(pr->sd, &euid, &egid)) {
            pmix_output_verbose(2, pmix_globals.debug_output,
                                "psec: getsockopt getpeereid failed: %s",
                                strerror (pmix_socket_errno));
            return PMIX_ERR_INVALID_CRED;
        }
#else
        return PMIX_ERR_NOT_SUPPORTED;
#endif
    } else if (PMIX_PROTOCOL_V2 == pr->protocol) {
        /* this is a tcp protocol, so the cred is actually the uid/gid
         * passed upwards from the client */
        if (NULL == cred || NULL == cred->bytes) {
            /* not allowed */
            return PMIX_ERR_INVALID_CRED;
        }
        ln = cred->size;
        euid = 0;
        egid = 0;
        /* the blob must be large enough for a uid_t followed by a gid_t */
        if (ln < sizeof(uid_t)) {
            return PMIX_ERR_INVALID_CRED;
        }
        memcpy(&euid, cred->bytes, sizeof(uid_t));
        ln -= sizeof(uid_t);
        ptr = cred->bytes + sizeof(uid_t);
        if (ln < sizeof(gid_t)) {
            return PMIX_ERR_INVALID_CRED;
        }
        memcpy(&egid, ptr, sizeof(gid_t));
    } else if (PMIX_PROTOCOL_UNDEF != pr->protocol) {
        /* don't recognize the protocol */
        return PMIX_ERR_NOT_SUPPORTED;
    }

    /* if we are responding to a local request to validate a credential,
     * then see if they specified a mechanism */
    if (NULL != directives && 0 < ndirs) {
        for (n=0; n < ndirs; n++) {
            if (0 != strncmp(directives[n].key, PMIX_CRED_TYPE, PMIX_MAX_KEYLEN)) {
                continue;
            }
            /* split the specified string */
            types = pmix_argv_split(directives[n].value.data.string, ',');
            if (NULL == types) {
                /* nothing was listed, so we cannot have been requested */
                return PMIX_ERR_NOT_SUPPORTED;
            }
            takeus = false;
            for (m=0; NULL != types[m]; m++) {
                if (0 == strcmp(types[m], "native")) {
                    /* it's us! */
                    takeus = true;
                    break;
                }
            }
            pmix_argv_free(types);
            if (!takeus) {
                return PMIX_ERR_NOT_SUPPORTED;
            }
        }
    }

    /* check uid */
    if (euid != pr->info->uid) {
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "psec: socket cred contains invalid uid %u", (unsigned int)euid);
        return PMIX_ERR_INVALID_CRED;
    }

    /* check gid */
    if (egid != pr->info->gid) {
        pmix_output_verbose(2, pmix_globals.debug_output,
                            "psec: socket cred contains invalid gid %u", (unsigned int)egid);
        return PMIX_ERR_INVALID_CRED;
    }

    /* validated - mark that we did it */
    if (NULL != info) {
        PMIX_INFO_CREATE(results, 3);
        *info = results;
        if (NULL == results) {
            return PMIX_ERR_NOMEM;
        }
        if (NULL != ninfo) {
            *ninfo = 3;
        }
        /* 'info' is a pointer to the array pointer, so the elements must be
         * addressed through the array itself rather than as info[n] */
        /* mark that this came from us */
        PMIX_INFO_LOAD(&results[0], PMIX_CRED_TYPE, "native", PMIX_STRING);
        /* provide the uid it contained */
        u32 = euid;
        PMIX_INFO_LOAD(&results[1], PMIX_USERID, &u32, PMIX_UINT32);
        /* provide the gid it contained */
        u32 = egid;
        PMIX_INFO_LOAD(&results[2], PMIX_GRPID, &u32, PMIX_UINT32);
    }
    return PMIX_SUCCESS;

}
示例#18
0
文件: jctrl.c 项目: bosilca/ompi
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_value_t value;
    pmix_value_t *val = &value;
    pmix_proc_t proc;
    uint32_t nprocs, n;
    pmix_info_t *info, *iptr;
    bool flag;
    mylock_t mylock;
    pmix_data_array_t *dptr;

    /* init us - note that the call to "init" includes the return of
     * any job-related info provided by the RM. */
    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Init failed: %d\n", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    fprintf(stderr, "Client ns %s rank %d: Running\n", myproc.nspace, myproc.rank);


    /* register our default event handler - again, this isn't strictly
     * required, but is generally good practice */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    /* wait for registration to complete */
    DEBUG_WAIT_THREAD(&mylock);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "[%s:%d] Default handler registration failed\n", myproc.nspace, myproc.rank);
        goto done;
    }

    /* job-related info is found in our nspace, assigned to the
     * wildcard rank as it doesn't relate to a specific rank. Setup
     * a name to retrieve such values */
    PMIX_PROC_CONSTRUCT(&proc);
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;

    /* get our universe size */
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Get universe size failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    nprocs = val->data.uint32;
    PMIX_VALUE_RELEASE(val);
    fprintf(stderr, "Client %s:%d universe size %d\n", myproc.nspace, myproc.rank, nprocs);

    /* inform the RM that we are preemptible, and that our checkpoint methods are
     * "signal" on SIGUSR2 and event on PMIX_JCTRL_CHECKPOINT */
    PMIX_INFO_CREATE(info, 2);
    flag = true;
    PMIX_INFO_LOAD(&info[0], PMIX_JOB_CTRL_PREEMPTIBLE, (void*)&flag, PMIX_BOOL);
    /* can't use "load" to load a pmix_data_array_t */
    (void)strncpy(info[1].key, PMIX_JOB_CTRL_CHECKPOINT_METHOD, PMIX_MAX_KEYLEN);
    PMIX_DATA_ARRAY_CREATE(info[1].value.data.darray, 2, PMIX_INFO);
    dptr = info[1].value.data.darray;
    rc = SIGUSR2;
    iptr = (pmix_info_t*)dptr->array;
    PMIX_INFO_LOAD(&iptr[0], PMIX_JOB_CTRL_CHECKPOINT_SIGNAL, &rc, PMIX_INT);
    rc = PMIX_JCTRL_CHECKPOINT;
    PMIX_INFO_LOAD(&iptr[1], PMIX_JOB_CTRL_CHECKPOINT_EVENT, &rc, PMIX_STATUS);

    /* since this is informational and not a requested operation, the target parameter
     * doesn't mean anything and can be ignored */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != (rc = PMIx_Job_control_nb(NULL, 0, info, 2, infocbfunc, (void*)&mylock))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        DEBUG_DESTRUCT_LOCK(&mylock);
        goto done;
    }
    DEBUG_WAIT_THREAD(&mylock);
    PMIX_INFO_FREE(info, 2);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Job_control_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* now request that this process be monitored using heartbeats */
    PMIX_INFO_CREATE(iptr, 1);
    PMIX_INFO_LOAD(&iptr[0], PMIX_MONITOR_HEARTBEAT, NULL, PMIX_POINTER);

    PMIX_INFO_CREATE(info, 3);
    PMIX_INFO_LOAD(&info[0], PMIX_MONITOR_ID, "MONITOR1", PMIX_STRING);
    n = 5;  // require a heartbeat every 5 seconds
    PMIX_INFO_LOAD(&info[1], PMIX_MONITOR_HEARTBEAT_TIME, &n, PMIX_UINT32);
    n = 2;  // two heartbeats can be missed before declaring us "stalled"
    PMIX_INFO_LOAD(&info[2], PMIX_MONITOR_HEARTBEAT_DROPS, &n, PMIX_UINT32);

    /* make the request */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != (rc = PMIx_Process_monitor_nb(iptr, PMIX_MONITOR_HEARTBEAT_ALERT,
                                                      info, 3, infocbfunc, (void*)&mylock))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        DEBUG_DESTRUCT_LOCK(&mylock);
        goto done;
    }
    DEBUG_WAIT_THREAD(&mylock);
    PMIX_INFO_FREE(iptr, 1);
    PMIX_INFO_FREE(info, 3);
    rc = mylock.status;
    DEBUG_DESTRUCT_LOCK(&mylock);
    if (PMIX_SUCCESS != rc) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Process_monitor_nb failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }

    /* send a heartbeat */
    PMIx_Heartbeat();

    /* call fence to synchronize with our peers - no need to
     * collect any info as we didn't "put" anything */
    PMIX_INFO_CREATE(info, 1);
    flag = false;
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &flag, PMIX_BOOL);
    if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, info, 1))) {
        fprintf(stderr, "Client ns %s rank %d: PMIx_Fence failed: %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    PMIX_INFO_FREE(info, 1);


  done:
    /* finalize us */
    fprintf(stderr, "Client ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
示例#19
0
/**
 * Query the server for the list of active namespaces as the first step of
 * attaching to a running job.
 *
 * Only the namespace query is implemented: the returned list is printed
 * but is not yet searched for the requested nspace.
 *
 * TODO: the remaining steps are not implemented - look up nspace in the
 * returned list, fetch its proctable, and spawn the debugger daemons
 * against it.
 *
 * @param nspace  namespace the caller wants to attach to (currently unused)
 *
 * @return 0 if the query returned a string value, -1 otherwise.
 */
static int attach_to_running_job(char *nspace)
{
    pmix_status_t rc;
    pmix_query_t *query;
    size_t nq;
    myquery_data_t *q;
    int ret = -1;

    (void)nspace;

    /* query the active nspaces so we can verify that the
     * specified one exists */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_NAMESPACES);

    q = (myquery_data_t*)malloc(sizeof(myquery_data_t));
    if (NULL == q) {
        PMIX_QUERY_FREE(query, nq);
        return -1;
    }
    /* the result is tested against NULL below, so make sure it starts out
     * that way even if the callback stores nothing */
    q->info = NULL;
    DEBUG_CONSTRUCT_LOCK(&q->lock);
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)q))) {
        fprintf(stderr, "PMIx_Query_info_nb failed: %d\n", rc);
        DEBUG_DESTRUCT_LOCK(&q->lock);
        goto cleanup;
    }
    DEBUG_WAIT_THREAD(&q->lock);
    DEBUG_DESTRUCT_LOCK(&q->lock);

    if (NULL == q->info) {
        fprintf(stderr, "Query returned no info\n");
        goto cleanup;
    }
    /* the query should have returned a comma-delimited list of nspaces */
    if (PMIX_STRING != q->info[0].value.type) {
        fprintf(stderr, "Query returned incorrect data type: %d\n", q->info[0].value.type);
        goto cleanup;
    }
    if (NULL == q->info[0].value.data.string) {
        fprintf(stderr, "Query returned no active nspaces\n");
        goto cleanup;
    }

    fprintf(stderr, "Query returned %s\n", q->info[0].value.data.string);
    ret = 0;

  cleanup:
    /* NOTE(review): q->info is presumably owned by us once cbfunc has
     * stored it; release it here once its element count is available -
     * confirm against the myquery_data_t definition */
    PMIX_QUERY_FREE(query, nq);
    free(q);
    return ret;
}
示例#20
0
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_info_t *info, *iptr;
    pmix_app_t *app;
    size_t ninfo, napps;
    char *nspace = NULL;
    int i;
    pmix_query_t *query;
    size_t nq, n;
    myquery_data_t myquery_data;
    bool cospawn = false, stop_on_exec = false, cospawn_reqd = false;
    char cwd[1024];
    pmix_status_t code = PMIX_ERR_JOB_TERMINATED;
    mylock_t mylock;
    myrel_t myrel, launcher_ready, dbrel;
    pid_t pid;
    pmix_envar_t envar;
    char *launchers[] = {
        "prun",
        "mpirun",
        "mpiexec",
        "orterun",
        NULL
    };
    pmix_proc_t proc;
    bool found;
    pmix_data_array_t darray;
    char *tmp;
    char clientspace[PMIX_MAX_NSLEN+1];

    pid = getpid();

    /* Process any arguments we were given */
    for (i=1; i < argc; i++) {
        if (0 == strcmp(argv[i], "-h") ||
            0 == strcmp(argv[i], "--help")) {
            /* print the usage message and exit */

        }
        if (0 == strcmp(argv[i], "-a") ||
            0 == strcmp(argv[i], "--attach")) {
            if (NULL != nspace) {
                /* can only support one */
                fprintf(stderr, "Cannot attach to more than one nspace\n");
                exit(1);
            }
            /* the next argument must be the nspace */
            ++i;
            if (argc == i) {
                /* they goofed */
                fprintf(stderr, "The %s option requires an <nspace> argument\n", argv[i]);
                exit(1);
            }
            nspace = strdup(argv[i]);
        } else if (0 == strcmp(argv[i], "-c") ||
                   0 == strcmp(argv[i], "--cospawn")){
            cospawn_reqd = true;
            break;
        }
    }
    info = NULL;
    ninfo = 0;

    /* use the system connection first, if available */
    PMIX_INFO_CREATE(info, 1);
    PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_SYSTEM_FIRST, NULL, PMIX_BOOL);
    /* init as a tool */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) {
        fprintf(stderr, "PMIx_tool_init failed: %s(%d)\n", PMIx_Error_string(rc), rc);
        exit(rc);
    }
    PMIX_INFO_FREE(info, ninfo);

    fprintf(stderr, "Debugger ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid);

    /* construct the debugger termination release */
    DEBUG_CONSTRUCT_LOCK(&dbrel.lock);

    /* register a default event handler */
    DEBUG_CONSTRUCT_LOCK(&mylock);
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&mylock);
    DEBUG_WAIT_THREAD(&mylock);
    DEBUG_DESTRUCT_LOCK(&mylock);

    /* if we are attaching to a running job, then attach to it */
    if (NULL != nspace) {
        if (PMIX_SUCCESS != (rc = attach_to_running_job(nspace))) {
            fprintf(stderr, "Failed to attach to nspace %s: error code %d\n",
                    nspace, rc);
            goto done;
        }
    }


  done:
    DEBUG_DESTRUCT_LOCK(&myrel.lock);
    DEBUG_DESTRUCT_LOCK(&dbrel.lock);
    PMIx_tool_finalize();

    return(rc);
}
示例#21
0
文件: error.c 项目: garlick/pmix
/**
 * Dispatch an error status to the registered error handlers.
 *
 * Each handler in pmix_globals.errregs is examined in turn:
 *   - a handler registered without any info is remembered as the default;
 *   - a handler whose info has a PMIX_ERROR_NAME entry equal to status is
 *     fired as an exact match;
 *   - otherwise a handler is fired once if any of its registered info
 *     entries matches (key and value) one of the incoming info entries.
 * If nothing fired, the default handler (if any) is fired.
 *
 * The array passed to a handler is the incoming info array with one extra
 * leading entry, PMIX_ERROR_HANDLER_ID, holding the handler's index.
 *
 * @param status  error status being reported
 * @param procs   processes affected by the error
 * @param nprocs  number of entries in procs
 * @param info    additional info about the error (may be NULL)
 * @param ninfo   number of entries in info
 */
void pmix_errhandler_invoke(pmix_status_t status,
                            pmix_proc_t procs[], size_t nprocs,
                            pmix_info_t info[], size_t ninfo)
{
    /* We need to parse thru each registered handler and determine
     * which one to call for the specific error */
    int i, idflt = 0;
    size_t j, k, m;
    bool fired = false;
    bool exact_match, key_match;
    pmix_error_reg_info_t *errreg, *errdflt=NULL;
    pmix_info_t *iptr;

    /* we will need to provide the errhandler reference id when
     * we provide the callback. Since the callback function doesn't
     * provide a param for that purpose, we have to add it to any
     * info array that came from the RM, so extend the array by 1 */
    PMIX_INFO_CREATE(iptr, ninfo+1);
    if (NULL == iptr) {
        /* nothing can be delivered without the array */
        return;
    }
    /* put the reference id in the first location */
    (void)strncpy(iptr[0].key, PMIX_ERROR_HANDLER_ID, PMIX_MAX_KEYLEN);
    iptr[0].value.type = PMIX_INT;
    /* we don't know the reference id yet, but we'll fill that in
     * later - for now, just copy the incoming info array across */
    if (NULL != info) {
        for (j=0; j < ninfo; j++) {
            /* NOTE(review): this hands the address of the value's data
             * union to the loader; confirm that is what the loader expects
             * for pointer-valued types such as strings */
            PMIX_INFO_LOAD(&iptr[j+1], info[j].key, &info[j].value.data, info[j].value.type);
        }
    }

    /* search our array of errhandlers for a match. We take any specific
     * error status first, then take the group of the incoming status next.
     * If neither of those have been registered, then use any default
     * errhandler - otherwise, ignore it */
    for (i = 0; i < pmix_globals.errregs.size; i++) {
        if (NULL == (errreg = (pmix_error_reg_info_t*) pmix_pointer_array_get_item(&pmix_globals.errregs, i))) {
            continue;
        }
        if (NULL == errreg->info || 0 == errreg->ninfo) {
            // this is a general err handler - we will call it if there is no better match
            errdflt = errreg;
            idflt = i;
            continue;
        }
        /* tell the handler which registration is being invoked */
        iptr[0].value.data.integer = i;
        /* match error name key first */
        exact_match = false;
        for (j = 0; j < errreg->ninfo; j++) {
            if ((0 == strcmp(errreg->info[j].key, PMIX_ERROR_NAME)) &&
                (status == errreg->info[j].value.data.int32)) {
                errreg->errhandler(status, procs, nprocs, iptr, ninfo+1);
                fired = true;
                exact_match = true;
                break;
            }
        }
        if (!exact_match && NULL != info) {
            /* if no exact match was found, then we will fire the errhandler
             * for any matching info key. This may be too lax and need to be adjusted
             * later. Each registered entry (k) is compared against each
             * incoming entry (m), with each index bounded by its own array. */
            key_match = false;
            for (k = 0; k < errreg->ninfo && !key_match; k++) {
                for (m = 0; m < ninfo; m++) {
                    if ((0 == strcmp(errreg->info[k].key, info[m].key)) &&
                        (pmix_value_cmp(&errreg->info[k].value, &info[m].value))) {
                        key_match = true;
                        break;
                    }
                }
            }
            if (key_match) {
                errreg->errhandler(status, procs, nprocs, iptr, ninfo+1);
                fired = true;
            }
        }
    }

    /* if nothing fired and we found a general err handler, then fire it */
    if (!fired && NULL != errdflt) {
        iptr[0].value.data.integer = idflt;
        errdflt->errhandler(status, procs, nprocs, iptr, ninfo+1);
    }
    /* cleanup */
    PMIX_INFO_FREE(iptr, ninfo+1);
}
示例#22
0
文件: pmi2.c 项目: thananon/ompi
/**
 * Initialize the PMI-2 layer on top of PMIx.
 *
 * If PMIx_Init reports PMIX_ERR_INVALID_NAMESPACE the process is set up as
 * a singleton (size 1, rank 0, not spawned, appnum 0). Otherwise the
 * requested values are fetched from the job-level (wildcard rank) data.
 *
 * @param spawned  if non-NULL, receives the spawned flag (0 if not found)
 * @param size     if non-NULL, receives the universe size (required key)
 * @param rank     if non-NULL, receives this process's rank
 * @param appnum   if non-NULL, receives the app number (0 if not found)
 *
 * @return PMI2_SUCCESS, PMI2_ERR_INIT if PMIx_Init failed, or the converted
 *         PMIx status if a lookup/conversion failed.
 */
PMIX_EXPORT int PMI2_Init(int *spawned, int *size, int *rank, int *appnum)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val;
    pmix_info_t info[1];
    bool  val_optinal = 1;
    pmix_proc_t proc;

    if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, NULL, 0))) {
        /* if we didn't see a PMIx server (e.g., missing envar),
         * then allow us to run as a singleton */
        if (PMIX_ERR_INVALID_NAMESPACE == rc) {
            if (NULL != spawned) {
                *spawned = 0;
            }
            if (NULL != size) {
                *size = 1;
            }
            if (NULL != rank) {
                *rank = 0;
            }
            if (NULL != appnum) {
                *appnum = 0;
            }
            pmi2_singleton = true;
            (void)strncpy(myproc.nspace, "1234", PMIX_MAX_NSLEN);
            myproc.rank = 0;
            pmi2_init = 1;
            return PMI2_SUCCESS;
        }
        return PMI2_ERR_INIT;
    }

    /* myproc is only valid once PMIx_Init has filled it in, so the
     * wildcard-rank name used for job-level lookups is built here */
    proc = myproc;
    proc.rank = PMIX_RANK_WILDCARD;

    /* get the rank */
    if (NULL != rank) {
        *rank = myproc.rank;
    }

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    if (NULL != size) {
        /* get the universe size - this will likely pull
         * down all attributes assigned to the job, thus
         * making all subsequent "get" operations purely
         * local */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_UNIV_SIZE, info, 1, &val)) {
            rc = convert_int(size, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* cannot continue without this info */
            rc = PMIX_ERR_INIT;
            goto error;
        }
    }

    if (NULL != spawned) {
        /* get the spawned flag */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_SPAWNED, info, 1, &val)) {
            rc = convert_int(spawned, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* if not found, default to not spawned */
            *spawned = 0;
        }
    }

    if (NULL != appnum) {
        /* get our appnum */
        if (PMIX_SUCCESS == PMIx_Get(&proc, PMIX_APPNUM, info, 1, &val)) {
            rc = convert_int(appnum, val);
            PMIX_VALUE_RELEASE(val);
            if (PMIX_SUCCESS != rc) {
                goto error;
            }
        } else {
            /* if not found, default to 0 */
            *appnum = 0;
        }
    }
    pmi2_init = 1;

    rc = PMIX_SUCCESS;

error:
    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#23
0
/**
 * Build the shared business-card table using PMIx.
 *
 * Allocates a segment of bc_len * size bytes, publishes this process's
 * business card under the key "bc" (all ranks, or only node leaders when
 * roots_only is set), fences, and then has each local process fetch and
 * decode its share of the published cards into the segment.
 *
 * @param rank        this process's rank
 * @param size        number of processes
 * @param nodemap     rank-to-node map
 * @param bc          this process's business card
 * @param bc_len      length of bc; replaced by VALLEN when !same_len
 * @param same_len    nonzero if all business cards have the same length
 * @param roots_only  nonzero to exchange only the node roots' cards
 * @param bc_table    receives the segment address
 * @param bc_indices  when !same_len, receives per-entry byte offsets
 *
 * @return MPI_SUCCESS or an MPI error code.
 */
int MPIDU_bc_table_create(int rank, int size, int *nodemap, void *bc, int bc_len, int same_len,
                          int roots_only, void **bc_table, size_t ** bc_indices)
{
    int rc, mpi_errno = MPI_SUCCESS;
    int start, end, i;
    char *val = NULL, *val_p;
    int out_len, rem;
    int nentries, num_nodes = 0, *node_roots = NULL;
    /* the cards are read back from every peer below, so the fence must
     * collect the data that was put */
    bool collect = true;
    pmix_value_t value, *pvalue;
    pmix_info_t *info;
    pmix_proc_t proc;
    int local_rank, local_leader;
    size_t my_bc_len = bc_len;

    MPIR_NODEMAP_get_local_info(rank, size, nodemap, &local_size, &local_rank, &local_leader);

    /* if business cards can be different length, use the max value length */
    if (!same_len)
        bc_len = VALLEN;
    mpi_errno = MPIDU_shm_seg_alloc(bc_len * size, (void **) &segment, MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    mpi_errno =
        MPIDU_shm_seg_commit(&memory, &barrier, local_size, local_rank, local_leader, rank,
                             MPL_MEM_ADDRESS);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    if (size == 1) {
        memcpy(segment, bc, my_bc_len);
        goto single;
    }

    val = MPL_malloc(VALLEN, MPL_MEM_ADDRESS);
    MPIR_ERR_CHKANDJUMP(NULL == val, mpi_errno, MPI_ERR_OTHER, "**nomem");
    memset(val, 0, VALLEN);
    val_p = val;
    rem = VALLEN;
    rc = MPL_str_add_binary_arg(&val_p, &rem, "mpi", (char *) bc, my_bc_len);
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**buscard");
    MPIR_Assert(rem >= 0);

    if (!roots_only || rank == local_leader) {
        value.type = PMIX_STRING;
        value.data.string = val;
        rc = PMIx_Put(PMIX_LOCAL, "bc", &value);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_put");
        rc = PMIx_Put(PMIX_REMOTE, "bc", &value);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_put");
        rc = PMIx_Commit();
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_commit");
    }

    PMIX_INFO_CREATE(info, 1);
    PMIX_INFO_LOAD(info, PMIX_COLLECT_DATA, &collect, PMIX_BOOL);
    rc = PMIx_Fence(&MPIR_Process.pmix_wcproc, 1, info, 1);
    /* release the directive before the error check so it is not leaked */
    PMIX_INFO_FREE(info, 1);
    MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_fence");

    /* decide how many table entries exist: one per rank, or one per node
     * when only the node roots published a card */
    if (roots_only) {
        /* NOTE(review): ownership of node_roots is not visible here -
         * confirm whether it must be freed by this function */
        MPIR_NODEMAP_get_node_roots(nodemap, size, &node_roots, &num_nodes);
        nentries = num_nodes;
    } else {
        nentries = size;
    }

    /* split the entries evenly across the local processes; the last local
     * process also takes the remainder */
    start = local_rank * (nentries / local_size);
    end = start + (nentries / local_size);
    if (local_rank == local_size - 1)
        end += nentries % local_size;
    for (i = start; i < end; i++) {
        PMIX_PROC_CONSTRUCT(&proc);
        MPL_strncpy(proc.nspace, MPIR_Process.pmix_proc.nspace, PMIX_MAX_NSLEN);
        /* in roots-only mode entry i belongs to the root of node i */
        proc.rank = roots_only ? node_roots[i] : i;
        rc = PMIx_Get(&proc, "bc", NULL, 0, &pvalue);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**pmix_get");
        /* decode the card that was just fetched, not our own */
        rc = MPL_str_get_binary_arg(pvalue->data.string, "mpi", &segment[i * bc_len], bc_len,
                                    &out_len);
        PMIX_VALUE_RELEASE(pvalue);
        MPIR_ERR_CHKANDJUMP(rc, mpi_errno, MPI_ERR_OTHER, "**argstr_missinghost");
    }
    mpi_errno = MPIDU_shm_barrier(barrier, local_size);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

  single:
    if (!same_len) {
        indices = MPL_malloc(size * sizeof(size_t), MPL_MEM_ADDRESS);
        MPIR_ERR_CHKANDJUMP(NULL == indices, mpi_errno, MPI_ERR_OTHER, "**nomem");
        for (i = 0; i < size; i++)
            indices[i] = bc_len * i;
        *bc_indices = indices;
    }

  fn_exit:
    MPL_free(val);
    *bc_table = segment;

    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
示例#24
0
/**
 * Create a credential for the "native" security mechanism.
 *
 * If directives contain a PMIX_CRED_TYPE entry, the request is honored
 * only when "native" is among the comma-separated types it lists. For
 * PMIX_PROTOCOL_V1 peers no credential bytes are produced; for
 * PMIX_PROTOCOL_V2 peers the credential is our effective uid followed by
 * our effective gid.
 *
 * @param peer        peer the credential is intended for
 * @param directives  optional directives from the requester
 * @param ndirs       number of entries in directives
 * @param info        if non-NULL, receives a newly created 1-element array
 *                    naming the credential type
 * @param ninfo       receives the number of entries stored in *info
 * @param cred        byte object that receives the credential
 *
 * @return PMIX_SUCCESS, PMIX_ERR_NOT_SUPPORTED if another mechanism was
 *         requested or the protocol is unrecognized, PMIX_ERR_NOMEM on
 *         allocation failure.
 */
static pmix_status_t create_cred(struct pmix_peer_t *peer,
                                 const pmix_info_t directives[], size_t ndirs,
                                 pmix_info_t **info, size_t *ninfo,
                                 pmix_byte_object_t *cred)
{
    pmix_peer_t *pr = (pmix_peer_t*)peer;
    char **types;
    size_t n, m;
    bool takeus;
    uid_t euid;
    gid_t egid;
    char *tmp, *ptr;

    /* ensure initialization */
    PMIX_BYTE_OBJECT_CONSTRUCT(cred);

    /* we may be responding to a local request for a credential, so
     * see if they specified a mechanism */
    if (NULL != directives && 0 < ndirs) {
        /* cycle across the provided info and see if they specified
         * any desired credential types */
        takeus = true;
        for (n=0; n < ndirs; n++) {
            if (0 != strncmp(directives[n].key, PMIX_CRED_TYPE, PMIX_MAX_KEYLEN)) {
                continue;
            }
            /* start by assuming they don't want us */
            takeus = false;
            /* see if we are included */
            types = pmix_argv_split(directives[n].value.data.string, ',');
            if (NULL != types) {
                for (m=0; NULL != types[m]; m++) {
                    if (0 == strcmp(types[m], "native")) {
                        /* it's us! */
                        takeus = true;
                        break;
                    }
                }
                pmix_argv_free(types);
            }
            /* only the first credential-type directive is considered */
            break;
        }
        if (!takeus) {
            PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED);
            return PMIX_ERR_NOT_SUPPORTED;
        }
    }

    if (PMIX_PROTOCOL_V1 == pr->protocol) {
        /* usock protocol - nothing to do */
    } else if (PMIX_PROTOCOL_V2 == pr->protocol) {
        /* tcp protocol - need to provide our effective
         * uid and gid for validation on remote end */
        tmp = (char*)malloc(sizeof(uid_t) + sizeof(gid_t));
        if (NULL == tmp) {
            return PMIX_ERR_NOMEM;
        }
        euid = geteuid();
        memcpy(tmp, &euid, sizeof(uid_t));
        ptr = tmp + sizeof(uid_t);
        egid = getegid();
        memcpy(ptr, &egid, sizeof(gid_t));
        cred->bytes = tmp;
        cred->size = sizeof(uid_t) + sizeof(gid_t);
    } else {
        /* unrecognized protocol */
        PMIX_ERROR_LOG(PMIX_ERR_NOT_SUPPORTED);
        return PMIX_ERR_NOT_SUPPORTED;
    }

    if (NULL != info) {
        /* mark that this came from us */
        PMIX_INFO_CREATE(*info, 1);
        if (NULL == *info) {
            /* don't hand back a credential alongside a failure status */
            free(cred->bytes);
            cred->bytes = NULL;
            cred->size = 0;
            return PMIX_ERR_NOMEM;
        }
        *ninfo = 1;
        PMIX_INFO_LOAD(&(*info)[0], PMIX_CRED_TYPE, "native", PMIX_STRING);
    }
    return PMIX_SUCCESS;
}
示例#25
0
文件: pmi2.c 项目: thananon/ompi
/**
 * Look up a job attribute by name and copy its string value out.
 *
 * The ANL process-mapping attribute is looked up in the job-level
 * (wildcard rank) data under PMIX_ANL_MAP; every other name is looked up
 * as given, with the PMIX_OPTIONAL directive.
 *
 * @param name      attribute name
 * @param value     buffer that receives the NUL-terminated value
 * @param valuelen  size of value in bytes; must be positive
 * @param found     set to 1 if a value was copied into value, else 0
 *
 * @return PMI2_SUCCESS, PMI2_ERR_INVALID_ARG for bad arguments, PMI2_FAIL
 *         for singletons or when the ANL mapping is unavailable, or the
 *         converted PMIx status otherwise.
 */
PMIX_EXPORT int PMI2_Info_GetJobAttr(const char name[], char value[], int valuelen, int *found)
{
    pmix_status_t rc = PMIX_SUCCESS;
    pmix_value_t *val = NULL;
    pmix_info_t info[1];
    bool  val_optinal = 1;
    pmix_proc_t proc = myproc;
    proc.rank = PMIX_RANK_UNDEF;

    PMI2_CHECK();

    if ((NULL == name) || (NULL == value) || (NULL == found) || (valuelen <= 0)) {
        return PMI2_ERR_INVALID_ARG;
    }

    if (pmi2_singleton) {
        return PMI2_FAIL;
    }

    /* PMI-2 expects resource manager to set
     * process mapping in ANL notation. */
    if (!strcmp(name, ANL_MAPPING)) {
        /* we are looking in the job-data. If there is nothing there
         * we don't want to look in rank's data, thus set rank to wildcard */
        proc.rank = PMIX_RANK_WILDCARD;
        rc = PMIx_Get(&proc, PMIX_ANL_MAP, NULL, 0, &val);
        if (PMIX_SUCCESS == rc && NULL != val) {
            if (PMIX_STRING == val->type && NULL != val->data.string) {
                (void)strncpy(value, val->data.string, valuelen);
                /* strncpy does not terminate when the source fills the buffer */
                value[valuelen - 1] = '\0';
                PMIX_VALUE_RELEASE(val);
                *found = 1;
                return PMI2_SUCCESS;
            }
            /* wrong type - still ours to release */
            PMIX_VALUE_RELEASE(val);
        }
        /* artpol:
         * Some RM's (i.e. SLURM) already have ANL precomputed. They export it
         * through the PMIX_ANL_MAP variable.
         * If we haven't found it we want to have our own packing functionality
         * since it's common.
         * Somebody else has to write it since I've already done that for
         * GPL'ed SLURM :) */
        *found = 0;
        return PMI2_FAIL;
    }

    /* set controlling parameters
     * PMIX_OPTIONAL - expect that these keys should be available on startup
     */
    PMIX_INFO_CONSTRUCT(&info[0]);
    PMIX_INFO_LOAD(&info[0], PMIX_OPTIONAL, &val_optinal, PMIX_BOOL);

    *found = 0;
    rc = PMIx_Get(&proc, name, info, 1, &val);
    if (PMIX_SUCCESS == rc && NULL != val) {
        if (PMIX_STRING != val->type) {
            rc = PMIX_ERROR;
        } else if (NULL != val->data.string) {
            (void)strncpy(value, val->data.string, valuelen);
            /* strncpy does not terminate when the source fills the buffer */
            value[valuelen - 1] = '\0';
            *found = 1;
        }
        PMIX_VALUE_RELEASE(val);
    } else if (PMIX_ERR_NOT_FOUND == rc) {
        rc = PMIX_SUCCESS;
    }

    PMIX_INFO_DESTRUCT(&info[0]);

    return convert_err(rc);
}
示例#26
0
文件: debuggerd.c 项目: bosilca/ompi
int main(int argc, char **argv)
{
    pmix_status_t rc;
    pmix_value_t *val;
    pmix_proc_t proc;
    pmix_info_t *info;
    size_t ninfo;
    volatile int active;
    pmix_query_t *query;
    size_t nq, n;
    myquery_data_t myquery_data;

fprintf(stderr, "I AM HERE\n");
fflush(stderr);
    sleep(10);
    exit(0);

    /* init us - since we were launched by the RM, our connection info
     * will have been provided at startup. */
    if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
        fprintf(stderr, "Debugger daemon ns %s rank %d: PMIx_tool_init failed: %d\n", myproc.nspace, myproc.rank, rc);
        exit(0);
    }
    fprintf(stderr, "Debugger daemon ns %s rank %d: Running\n", myproc.nspace, myproc.rank);


    /* register our default event handler */
    active = -1;
    PMIx_Register_event_handler(NULL, 0, NULL, 0,
                                notification_fn, evhandler_reg_callbk, (void*)&active);
    while (-1 == active) {
        usleep(10);
    }
    if (0 != active) {
        exit(active);
    }

    /* get the nspace of the job we are to debug */
    (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
        fprintf(stderr, "[%s:%d] Failed to get job being debugged - error %d\n", myproc.nspace, myproc.rank, rc);
        goto done;
    }
    if (NULL == val) {
        fprintf(stderr, "Got NULL return\n");
        goto done;
    }
    fprintf(stderr, "[%s:%d] Debugging %s\n", myproc.nspace, myproc.rank, val->data.string);

    /* get our local proctable - for scalability reasons, we don't want to
     * have our "root" debugger process get the proctable for everybody and
     * send it out to us. So ask the local PMIx server for the pid's of
     * our local target processes */
    nq = 1;
    PMIX_QUERY_CREATE(query, nq);
    PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE);
    query[0].nqual = 1;
    PMIX_INFO_CREATE(query[0].qualifiers, 1);
    PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, val->data.string, PMIX_STRING);  // the nspace we are enquiring about
    /* setup the caddy to retrieve the data */
    myquery_data.info = NULL;
    myquery_data.ninfo = 0;
    myquery_data.active = true;
    /* execute the query */
    if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
        fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
        goto done;
    }
    while (myquery_data.active) {
        usleep(10);
    }
    fprintf(stderr, "[%s:%d] Local proctable received\n", myproc.nspace, myproc.rank);


    /* now that we have the proctable for our local processes, we can do our
     * magic debugger stuff and attach to them. We then send a "release" event
     * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We
     * do this with the event notification system */
    (void)strncpy(proc.nspace, val->data.string, PMIX_MAX_NSLEN);
    proc.rank = PMIX_RANK_WILDCARD;
    /* we send the notification to just the local procs of the job being debugged */
    ninfo = 1;
    PMIX_INFO_CREATE(info, ninfo);
    PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC);  // deliver to the target nspace
    fprintf(stderr, "[%s:%u] Sending release\n", myproc.nspace, myproc.rank);
    PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
                      NULL, PMIX_RANGE_LOCAL,
                      info, ninfo, NULL, NULL);

    /* do some debugger magic */
    n = 0;
    fprintf(stderr, "[%s:%u] Hanging around awhile, doing debugger magic\n", myproc.nspace, myproc.rank);
    while (n < 5) {
        usleep(1000);
        ++n;
    }

  done:
    /* finalize us */
    fprintf(stderr, "Debugger daemon ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
    if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
        fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
    } else {
        fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
    }
    fflush(stderr);
    return(0);
}
示例#27
0
文件: pmix_log.c 项目: bosilca/ompi
/**
 * Non-blocking request to log data.
 *
 * Clients and tools never log locally: the command, an optional
 * timestamp, the data and the directives are packed into a buffer and
 * relayed to the server. Servers and launchers hand the request to the
 * plog framework, first tagging it with this process as the source if
 * the caller did not supply a PMIX_LOG_SOURCE directive.
 *
 * @param data        array of info to be logged (must be non-NULL)
 * @param ndata       number of elements in data (must be non-zero)
 * @param directives  optional array of directives; scanned for
 *                    PMIX_LOG_GENERATE_TIMESTAMP and PMIX_LOG_SOURCE
 * @param ndirs       number of elements in directives
 * @param cbfunc      callback recorded for completion of the request
 * @param cbdata      opaque pointer handed back to cbfunc
 *
 * @return PMIX_ERR_INIT if the library is not initialized,
 *         PMIX_ERR_BAD_PARAM if data is missing or directives is NULL
 *         while ndirs is non-zero,
 *         PMIX_ERR_UNREACH if a client/tool is not connected,
 *         PMIX_ERR_NOMEM if a tracking object cannot be allocated,
 *         PMIX_ERR_NOT_SUPPORTED if the request names this process as
 *         its source (i.e. it already came through here once),
 *         otherwise the status of the pack/send or plog operation.
 */
PMIX_EXPORT pmix_status_t PMIx_Log_nb(const pmix_info_t data[], size_t ndata,
                                      const pmix_info_t directives[], size_t ndirs,
                                      pmix_op_cbfunc_t cbfunc, void *cbdata)

{
    pmix_shift_caddy_t *cd;
    pmix_cmd_t cmd = PMIX_LOG_CMD;
    pmix_buffer_t *msg;
    pmix_status_t rc;
    size_t n;
    time_t timestamp = 0;
    pmix_proc_t *source = NULL;

    PMIX_ACQUIRE_THREAD(&pmix_global_lock);

    pmix_output_verbose(2, pmix_globals.debug_output,
                        "pmix:log non-blocking");

    if (pmix_globals.init_cntr <= 0) {
        PMIX_RELEASE_THREAD(&pmix_global_lock);
        return PMIX_ERR_INIT;
    }

    if (0 == ndata || NULL == data) {
        PMIX_RELEASE_THREAD(&pmix_global_lock);
        return PMIX_ERR_BAD_PARAM;
    }

    /* a non-zero directive count without an array cannot be packed or
     * copied below, so reject it up front just like missing data */
    if (NULL == directives && 0 < ndirs) {
        PMIX_RELEASE_THREAD(&pmix_global_lock);
        return PMIX_ERR_BAD_PARAM;
    }

    /* check the directives - if they requested a timestamp, then
     * get the time, also look for a source */
    if (NULL != directives) {
        for (n=0; n < ndirs; n++) {
            if (0 == strncmp(directives[n].key, PMIX_LOG_GENERATE_TIMESTAMP, PMIX_MAX_KEYLEN)) {
                if (PMIX_INFO_TRUE(&directives[n])) {
                    /* pickup the timestamp */
                    timestamp = time(NULL);
                }
            } else if (0 == strncmp(directives[n].key, PMIX_LOG_SOURCE, PMIX_MAX_KEYLEN)) {
                source = directives[n].value.data.proc;
            }
        }
    }

    /* if we are a client or tool, we never do this ourselves - we
     * always pass this request to our server for execution */
    if (!PMIX_PROC_IS_SERVER(pmix_globals.mypeer) &&
        !PMIX_PROC_IS_LAUNCHER(pmix_globals.mypeer)) {
        /* if we aren't connected, don't attempt to send */
        if (!pmix_globals.connected) {
            PMIX_RELEASE_THREAD(&pmix_global_lock);
            return PMIX_ERR_UNREACH;
        }
        PMIX_RELEASE_THREAD(&pmix_global_lock);

        /* if we are not a server, then relay this request to the server */
        cd = PMIX_NEW(pmix_shift_caddy_t);
        if (NULL == cd) {
            return PMIX_ERR_NOMEM;
        }
        cd->cbfunc.opcbfn = cbfunc;
        cd->cbdata = cbdata;
        msg = PMIX_NEW(pmix_buffer_t);
        if (NULL == msg) {
            PMIX_RELEASE(cd);
            return PMIX_ERR_NOMEM;
        }
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &cmd, 1, PMIX_COMMAND);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        /* provide the timestamp - zero will indicate
         * that it wasn't taken */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &timestamp, 1, PMIX_TIME);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        /* pack the number of data entries */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &ndata, 1, PMIX_SIZE);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        if (0 < ndata) {
            PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                             msg, data, ndata, PMIX_INFO);
            if (PMIX_SUCCESS != rc) {
                PMIX_ERROR_LOG(rc);
                PMIX_RELEASE(msg);
                PMIX_RELEASE(cd);
                return rc;
            }
        }
        /* pack the number of directives, followed by the directives */
        PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                         msg, &ndirs, 1, PMIX_SIZE);
        if (PMIX_SUCCESS != rc) {
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
            return rc;
        }
        if (0 < ndirs) {
            PMIX_BFROPS_PACK(rc, pmix_client_globals.myserver,
                             msg, directives, ndirs, PMIX_INFO);
            if (PMIX_SUCCESS != rc) {
                PMIX_ERROR_LOG(rc);
                PMIX_RELEASE(msg);
                PMIX_RELEASE(cd);
                return rc;
            }
        }

        pmix_output_verbose(2, pmix_plog_base_framework.framework_output,
                            "pmix:log sending to server");
        PMIX_PTL_SEND_RECV(rc, pmix_client_globals.myserver,
                           msg, log_cbfunc, (void*)cd);
        if (PMIX_SUCCESS != rc) {
            /* the message was never handed off, so it is still ours to
             * release - same as on every pack failure above */
            PMIX_ERROR_LOG(rc);
            PMIX_RELEASE(msg);
            PMIX_RELEASE(cd);
        }
        return rc;
    }
    PMIX_RELEASE_THREAD(&pmix_global_lock);

    /* if no recorded source was found, then we must be it */
    if (NULL == source) {
        source = &pmix_globals.myid;
        cd = PMIX_NEW(pmix_shift_caddy_t);
        if (NULL == cd) {
            return PMIX_ERR_NOMEM;
        }
        cd->cbfunc.opcbfn = cbfunc;
        cd->cbdata = cbdata;
        /* copy the caller's directives and leave one extra slot at the
         * end for the source marker */
        cd->ndirs = ndirs + 1;
        PMIX_INFO_CREATE(cd->directives, cd->ndirs);
        if (NULL == cd->directives) {
            PMIX_RELEASE(cd);
            return PMIX_ERR_NOMEM;
        }
        for (n=0; n < ndirs; n++) {
            PMIX_INFO_XFER(&cd->directives[n], (pmix_info_t*)&directives[n]);
        }
        /* load the proc itself - "source" is already the pointer to it,
         * so taking its address would hand over a pmix_proc_t** */
        PMIX_INFO_LOAD(&cd->directives[ndirs], PMIX_LOG_SOURCE, source, PMIX_PROC);
        /* call down to process the request - the various components
         * will thread shift as required */
        rc = pmix_plog.log(source, data, ndata, cd->directives, cd->ndirs, localcbfunc, cd);
        if (PMIX_SUCCESS != rc) {
            PMIX_INFO_FREE(cd->directives, cd->ndirs);
            PMIX_RELEASE(cd);
        }
    } else if (0 == strncmp(source->nspace, pmix_globals.myid.nspace, PMIX_MAX_NSLEN) &&
               source->rank == pmix_globals.myid.rank) {
        /* if I am the recorded source, then this is a re-submission of
         * something that got "upcalled" by a prior call. In this case,
         * we return a "not supported" error as clearly we couldn't
         * handle it, and neither could our host */
        rc = PMIX_ERR_NOT_SUPPORTED;
    } else {
        /* call down to process the request - the various components
         * will thread shift as required */
        rc = pmix_plog.log(source, data, ndata, directives, ndirs, cbfunc, cbdata);
    }

    return rc;
}