pmix_status_t pmix_bfrop_print_status(char **output, char *prefix, pmix_status_t *src, pmix_data_type_t type) { char *prefx; /* deal with NULL prefix */ if (NULL == prefix) { if (0 > asprintf(&prefx, " ")) { return PMIX_ERR_NOMEM; } } else { prefx = prefix; } /* if src is NULL, just print data type and return */ if (NULL == src) { if (0 > asprintf(output, "%sData type: PMIX_STATUS\tValue: NULL pointer", prefx)) { return PMIX_ERR_NOMEM; } if (prefx != prefix) { free(prefx); } return PMIX_SUCCESS; } if (0 > asprintf(output, "%sData type: PMIX_STATUS\tValue: %s", prefx, PMIx_Error_string(*src))) { return PMIX_ERR_NOMEM; } if (prefx != prefix) { free(prefx); } return PMIX_SUCCESS; }
/* this is an event notification function that we explicitly request * be called when the PMIX_MODEL_DECLARED notification is issued. * We could catch it in the general event notification function and test * the status to see if the status matched, but it often is simpler * to declare a use-specific notification callback point. In this case, * we are asking to know whenever a model is declared as a means * of testing server self-notification */ static void model_callback(size_t evhdlr_registration_id, pmix_status_t status, const pmix_proc_t *source, pmix_info_t info[], size_t ninfo, pmix_info_t results[], size_t nresults, pmix_event_notification_cbfunc_fn_t cbfunc, void *cbdata) { size_t n; /* just let us know it was received */ fprintf(stderr, "%s:%d Model event handler called with status %d(%s)\n", myproc.nspace, myproc.rank, status, PMIx_Error_string(status)); for (n=0; n < ninfo; n++) { if (PMIX_STRING == info[n].value.type) { fprintf(stderr, "%s:%d\t%s:\t%s\n", myproc.nspace, myproc.rank, info[n].key, info[n].value.data.string); } } /* we must NOT tell the event handler state machine that we * are the last step as that will prevent it from notifying * anyone else that might be listening for declarations */ if (NULL != cbfunc) { cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata); } }
static pmix_status_t spawn_debugger(char *appspace, myrel_t *myrel) { pmix_status_t rc; pmix_info_t *dinfo; pmix_app_t *debugger; size_t dninfo; char cwd[1024]; char dspace[PMIX_MAX_NSLEN+1]; mylock_t mylock; pmix_status_t code = PMIX_ERR_JOB_TERMINATED; /* setup the debugger */ PMIX_APP_CREATE(debugger, 1); debugger[0].cmd = strdup("./debuggerd"); PMIX_ARGV_APPEND(rc, debugger[0].argv, "./debuggerd"); getcwd(cwd, 1024); // point us to our current directory debugger[0].cwd = strdup(cwd); /* provide directives so the daemons go where we want, and * let the RM know these are debugger daemons */ dninfo = 6; PMIX_INFO_CREATE(dinfo, dninfo); PMIX_INFO_LOAD(&dinfo[0], PMIX_MAPBY, "ppr:1:node", PMIX_STRING); // instruct the RM to launch one copy of the executable on each node PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUGGER_DAEMONS, NULL, PMIX_BOOL); // these are debugger daemons PMIX_INFO_LOAD(&dinfo[1], PMIX_DEBUG_JOB, appspace, PMIX_STRING); // the nspace being debugged PMIX_INFO_LOAD(&dinfo[2], PMIX_NOTIFY_COMPLETION, NULL, PMIX_BOOL); // notify us when the debugger job completes PMIX_INFO_LOAD(&dinfo[3], PMIX_DEBUG_WAITING_FOR_NOTIFY, NULL, PMIX_BOOL); // tell the daemon that the proc is waiting to be released PMIX_INFO_LOAD(&dinfo[4], PMIX_FWD_STDOUT, NULL, PMIX_BOOL); // forward stdout to me PMIX_INFO_LOAD(&dinfo[5], PMIX_FWD_STDERR, NULL, PMIX_BOOL); // forward stderr to me /* spawn the daemons */ fprintf(stderr, "Debugger: spawning %s\n", debugger[0].cmd); if (PMIX_SUCCESS != (rc = PMIx_Spawn(dinfo, dninfo, debugger, 1, dspace))) { fprintf(stderr, "Debugger daemons failed to launch with error: %s\n", PMIx_Error_string(rc)); PMIX_INFO_FREE(dinfo, dninfo); PMIX_APP_FREE(debugger, 1); return rc; } /* cleanup */ PMIX_INFO_FREE(dinfo, dninfo); PMIX_APP_FREE(debugger, 1); /* register callback for when this job terminates */ myrel->nspace = strdup(dspace); PMIX_INFO_CREATE(dinfo, 2); PMIX_INFO_LOAD(&dinfo[0], PMIX_EVENT_RETURN_OBJECT, myrel, PMIX_POINTER); /* only call me back when this specific job terminates */ PMIX_INFO_LOAD(&dinfo[1], PMIX_NSPACE, dspace, PMIX_STRING); DEBUG_CONSTRUCT_LOCK(&mylock); PMIx_Register_event_handler(&code, 1, dinfo, 2, release_fn, evhandler_reg_callbk, (void*)&mylock); DEBUG_WAIT_THREAD(&mylock); rc = mylock.status; DEBUG_DESTRUCT_LOCK(&mylock); PMIX_INFO_FREE(dinfo, 2); return rc; }
int main(int argc, char **argv) { pmix_status_t rc; pmix_proc_t myproc; pmix_info_t *info; size_t ninfo; /* init us */ if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) { fprintf(stderr, "PMIx_tool_init failed: %s\n", PMIx_Error_string(rc)); exit(rc); } fprintf(stderr, "Tool ns %s rank %d: Running\n", myproc.nspace, myproc.rank); /* query something */ ninfo = 1; PMIX_INFO_CREATE(info, ninfo); (void)strncpy(info[0].key, PMIX_QUERY_NAMESPACES, PMIX_MAX_KEYLEN); if (PMIX_SUCCESS != (rc = PMIx_Query_info(info, ninfo))) { fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info failed: %d\n", myproc.nspace, myproc.rank, rc); goto done; } if (0 != strncmp(info[0].key, PMIX_QUERY_NAMESPACES, PMIX_MAX_KEYLEN)) { fprintf(stderr, "tool ns %s rank %d: PMIx_Query_info key[0] wrong: %s vs %s\n", myproc.nspace, myproc.rank, info[0].key, PMIX_QUERY_NAMESPACES); } if (PMIX_STRING != info[0].value.type) { fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info key[0] wrong type: %d vs %d\n", myproc.nspace, myproc.rank, info[0].value.type, PMIX_STRING); } fprintf(stderr, "Tool ns %s rank %d: PMIx_Query_info key[0] returned %s\n", myproc.nspace, myproc.rank, (NULL == info[0].value.data.string) ? "NULL" : info[0].value.data.string); PMIX_INFO_FREE(info, ninfo); done: /* finalize us */ fprintf(stderr, "Tool ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank); if (PMIX_SUCCESS != (rc = PMIx_tool_finalize())) { fprintf(stderr, "Tool ns %s rank %d:PMIx_tool_finalize failed: %d\n", myproc.nspace, myproc.rank, rc); } else { fprintf(stderr, "Tool ns %s rank %d:PMIx_tool_finalize successfully completed\n", myproc.nspace, myproc.rank); } fflush(stderr); return(rc); }
static void notification_fn(size_t evhdlr_registration_id, pmix_status_t status, const pmix_proc_t *source, pmix_info_t info[], size_t ninfo, pmix_info_t results[], size_t nresults, pmix_event_notification_cbfunc_fn_t cbfunc, void *cbdata) { pmix_output(0, "Client %s:%d NOTIFIED with status %s", myproc.nspace, myproc.rank, PMIx_Error_string(status)); if (NULL != cbfunc) { cbfunc(PMIX_SUCCESS, NULL, 0, NULL, NULL, cbdata); } completed = true; }
int main(int argc, char **argv) { int rc; pmix_value_t value; pmix_value_t *val = &value; char *tmp; pmix_proc_t proc; uint32_t nprocs, n; int cnt, j; bool doabort = false; volatile bool active; pmix_info_t info, *iptr; size_t ninfo; pmix_status_t code; if (1 < argc) { if (0 == strcmp("-abort", argv[1])) { doabort = true; } } /* init us and declare we are a test programming model */ PMIX_INFO_CREATE(iptr, 2); PMIX_INFO_LOAD(&iptr[0], PMIX_PROGRAMMING_MODEL, "TEST", PMIX_STRING); PMIX_INFO_LOAD(&iptr[1], PMIX_MODEL_LIBRARY_NAME, "PMIX", PMIX_STRING); if (PMIX_SUCCESS != (rc = PMIx_Init(&myproc, iptr, 2))) { pmix_output(0, "Client ns %s rank %d: PMIx_Init failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); exit(rc); } PMIX_INFO_FREE(iptr, 2); pmix_output(0, "Client ns %s rank %d: Running", myproc.nspace, myproc.rank); /* test something */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_JOB_SIZE, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); exit(rc); } PMIX_VALUE_RELEASE(val); /* test something */ if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_SERVER_URI, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d: PMIx_Get failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); exit(rc); } pmix_output(0, "CLIENT SERVER URI: %s", val->data.string); PMIX_VALUE_RELEASE(val); /* register a handler specifically for when models declare */ active = true; ninfo = 1; PMIX_INFO_CREATE(iptr, ninfo); PMIX_INFO_LOAD(&iptr[0], PMIX_EVENT_HDLR_NAME, "SIMPCLIENT-MODEL", PMIX_STRING); code = PMIX_MODEL_DECLARED; PMIx_Register_event_handler(&code, 1, iptr, ninfo, model_callback, model_registration_callback, (void*)&active); while (active) { usleep(10); } PMIX_INFO_FREE(iptr, ninfo); /* register our errhandler */ active = true; PMIx_Register_event_handler(NULL, 0, NULL, 0, notification_fn, errhandler_reg_callbk, (void*)&active); while (active) { usleep(10); } /* get our universe size */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_UNIV_SIZE, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d: PMIx_Get universe size failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } nprocs = val->data.uint32; PMIX_VALUE_RELEASE(val); pmix_output(0, "Client %s:%d universe size %d", myproc.nspace, myproc.rank, nprocs); /* put a few values */ (void)asprintf(&tmp, "%s-%d-internal", myproc.nspace, myproc.rank); value.type = PMIX_UINT32; value.data.uint32 = 1234; if (PMIX_SUCCESS != (rc = PMIx_Store_internal(&myproc, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Store_internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } for (cnt=0; cnt < MAXCNT; cnt++) { (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, myproc.rank, cnt); value.type = PMIX_UINT64; value.data.uint64 = 1234; if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_LOCAL, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } (void)asprintf(&tmp, "%s-%d-remote-%d", myproc.nspace, myproc.rank, cnt); value.type = PMIX_STRING; value.data.string = "1234"; if (PMIX_SUCCESS != (rc = PMIx_Put(PMIX_REMOTE, tmp, &value))) { pmix_output(0, "Client ns %s rank %d: PMIx_Put internal failed: %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); goto done; } if (PMIX_SUCCESS != (rc = PMIx_Commit())) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Commit failed: %s", myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc)); goto done; } /* call fence to ensure the data is received */ PMIX_PROC_CONSTRUCT(&proc); (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; if (PMIX_SUCCESS != (rc = PMIx_Fence(&proc, 1, NULL, 0))) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Fence failed: %s", myproc.nspace, myproc.rank, cnt, PMIx_Error_string(rc)); goto done; } /* check the returned data */ (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN); for (j=0; j <= cnt; j++) { for (n=0; n < nprocs; n++) { proc.rank = n; (void)asprintf(&tmp, "%s-%d-local-%d", myproc.nspace, n, j); if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s failed: %s", myproc.nspace, myproc.rank, j, tmp, PMIx_Error_string(rc)); continue; } if (NULL == val) { pmix_output(0, "Client ns %s rank %d: NULL value returned", myproc.nspace, myproc.rank); break; } if (PMIX_UINT64 != val->type) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong type: %d", myproc.nspace, myproc.rank, j, tmp, val->type); PMIX_VALUE_RELEASE(val); free(tmp); continue; } if (1234 != val->data.uint64) { pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned wrong value: %d", myproc.nspace, myproc.rank, j, tmp, (int)val->data.uint64); PMIX_VALUE_RELEASE(val); free(tmp); continue; } pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); PMIX_VALUE_RELEASE(val); free(tmp); if (n != myproc.rank) { (void)asprintf(&tmp, "%s-%d-remote-%d", proc.nspace, n, j); if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, tmp, NULL, 0, &val))) { /* this data should _not_ be found as we are on the same node * and the data was "put" with a PMIX_REMOTE scope */ pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned correct", myproc.nspace, myproc.rank, j, tmp); continue; } pmix_output(0, "Client ns %s rank %d cnt %d: PMIx_Get %s returned remote data for a local proc", myproc.nspace, myproc.rank, j, tmp); PMIX_VALUE_RELEASE(val); free(tmp); } } } } /* now get the data blob for myself */ pmix_output(0, "Client ns %s rank %d testing internal modex blob", myproc.nspace, myproc.rank); if (PMIX_SUCCESS == (rc = PMIx_Get(&myproc, NULL, NULL, 0, &val))) { if (PMIX_DATA_ARRAY != val->type) { pmix_output(0, "Client ns %s rank %d did not return an array for its internal modex blob", myproc.nspace, myproc.rank); PMIX_VALUE_RELEASE(val); } else if (PMIX_INFO != val->data.darray->type) { pmix_output(0, "Client ns %s rank %d returned an internal modex array of type %s instead of PMIX_INFO", myproc.nspace, myproc.rank, PMIx_Data_type_string(val->data.darray->type)); PMIX_VALUE_RELEASE(val); } else if (0 == val->data.darray->size) { pmix_output(0, "Client ns %s rank %d returned an internal modex array of zero length", myproc.nspace, myproc.rank); PMIX_VALUE_RELEASE(val); } else { pmix_info_t *iptr = (pmix_info_t*)val->data.darray->array; for (n=0; n < val->data.darray->size; n++) { pmix_output(0, "\tKey: %s", iptr[n].key); } PMIX_VALUE_RELEASE(val); } } else { pmix_output(0, "Client ns %s rank %d internal modex blob FAILED with error %s(%d)", myproc.nspace, myproc.rank, PMIx_Error_string(rc), rc); } /* log something */ PMIX_INFO_CONSTRUCT(&info); PMIX_INFO_LOAD(&info, PMIX_LOG_STDERR, "test log msg", PMIX_STRING); active = true; rc = PMIx_Log_nb(&info, 1, NULL, 0, opcbfunc, (void*)&active); if (PMIX_SUCCESS != rc) { pmix_output(0, "Client ns %s rank %d - log_nb returned %s", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); } else { while (active) { usleep(10); } } PMIX_INFO_DESTRUCT(&info); /* if requested and our rank is 0, call abort */ if (doabort) { if (0 == myproc.rank) { PMIx_Abort(PMIX_ERR_PROC_REQUESTED_ABORT, "CALLING ABORT", NULL, 0); } else { while(!completed) { usleep(10); } } } done: /* finalize us */ pmix_output(0, "Client ns %s rank %d: Finalizing", myproc.nspace, myproc.rank); if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize failed: %s\n", myproc.nspace, myproc.rank, PMIx_Error_string(rc)); } else { fprintf(stderr, "Client ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank); } fflush(stderr); return(rc); }
int main(int argc, char **argv) { pmix_status_t rc; pmix_info_t *info, *iptr; pmix_app_t *app; size_t ninfo, napps; char *nspace = NULL; int i; pmix_query_t *query; size_t nq, n; myquery_data_t myquery_data; bool cospawn = false, stop_on_exec = false, cospawn_reqd = false; char cwd[1024]; pmix_status_t code = PMIX_ERR_JOB_TERMINATED; mylock_t mylock; myrel_t myrel, launcher_ready, dbrel; pid_t pid; pmix_envar_t envar; char *launchers[] = { "prun", "mpirun", "mpiexec", "orterun", NULL }; pmix_proc_t proc; bool found; pmix_data_array_t darray; char *tmp; char clientspace[PMIX_MAX_NSLEN+1]; pid = getpid(); /* Process any arguments we were given */ for (i=1; i < argc; i++) { if (0 == strcmp(argv[i], "-h") || 0 == strcmp(argv[i], "--help")) { /* print the usage message and exit */ } if (0 == strcmp(argv[i], "-a") || 0 == strcmp(argv[i], "--attach")) { if (NULL != nspace) { /* can only support one */ fprintf(stderr, "Cannot attach to more than one nspace\n"); exit(1); } /* the next argument must be the nspace */ ++i; if (argc == i) { /* they goofed */ fprintf(stderr, "The %s option requires an <nspace> argument\n", argv[i]); exit(1); } nspace = strdup(argv[i]); } else if (0 == strcmp(argv[i], "-c") || 0 == strcmp(argv[i], "--cospawn")){ cospawn_reqd = true; break; } } info = NULL; ninfo = 0; /* use the system connection first, if available */ PMIX_INFO_CREATE(info, 1); PMIX_INFO_LOAD(&info[0], PMIX_CONNECT_SYSTEM_FIRST, NULL, PMIX_BOOL); /* init as a tool */ if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, info, ninfo))) { fprintf(stderr, "PMIx_tool_init failed: %s(%d)\n", PMIx_Error_string(rc), rc); exit(rc); } PMIX_INFO_FREE(info, ninfo); fprintf(stderr, "Debugger ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid); /* construct the debugger termination release */ DEBUG_CONSTRUCT_LOCK(&dbrel.lock); /* register a default event handler */ DEBUG_CONSTRUCT_LOCK(&mylock); PMIx_Register_event_handler(NULL, 0, NULL, 0, notification_fn, evhandler_reg_callbk, (void*)&mylock); DEBUG_WAIT_THREAD(&mylock); DEBUG_DESTRUCT_LOCK(&mylock); /* if we are attaching to a running job, then attach to it */ if (NULL != nspace) { if (PMIX_SUCCESS != (rc = attach_to_running_job(nspace))) { fprintf(stderr, "Failed to attach to nspace %s: error code %d\n", nspace, rc); goto done; } } done: DEBUG_DESTRUCT_LOCK(&myrel.lock); DEBUG_DESTRUCT_LOCK(&dbrel.lock); PMIx_tool_finalize(); return(rc); }
/* * PMIX_VALUE */ pmix_status_t pmix_bfrop_print_value(char **output, char *prefix, pmix_value_t *src, pmix_data_type_t type) { char *prefx; int rc; /* deal with NULL prefix */ if (NULL == prefix) { if (0 > asprintf(&prefx, " ")) { return PMIX_ERR_NOMEM; } } else { prefx = prefix; } /* if src is NULL, just print data type and return */ if (NULL == src) { if (0 > asprintf(output, "%sData type: PMIX_VALUE\tValue: NULL pointer", prefx)) { return PMIX_ERR_NOMEM; } if (prefx != prefix) { free(prefx); } return PMIX_SUCCESS; } switch (src->type) { case PMIX_BYTE: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_BYTE\tValue: %x", prefx, src->data.byte); break; case PMIX_STRING: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_STRING\tValue: %s", prefx, src->data.string); break; case PMIX_SIZE: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_SIZE\tValue: %lu", prefx, (unsigned long)src->data.size); break; case PMIX_PID: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_PID\tValue: %lu", prefx, (unsigned long)src->data.pid); break; case PMIX_INT: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_INT\tValue: %d", prefx, src->data.integer); break; case PMIX_INT8: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_INT8\tValue: %d", prefx, (int)src->data.int8); break; case PMIX_INT16: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_INT16\tValue: %d", prefx, (int)src->data.int16); break; case PMIX_INT32: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_INT32\tValue: %d", prefx, src->data.int32); break; case PMIX_INT64: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_INT64\tValue: %ld", prefx, (long)src->data.int64); break; case PMIX_UINT: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_UINT\tValue: %u", prefx, src->data.uint); break; case PMIX_UINT8: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_UINT8\tValue: %u", prefx, (unsigned int)src->data.uint8); break; case PMIX_UINT16: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_UINT16\tValue: %u", prefx, (unsigned int)src->data.uint16); break; case PMIX_UINT32: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_UINT32\tValue: %u", prefx, src->data.uint32); break; case PMIX_UINT64: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_UINT64\tValue: %lu", prefx, (unsigned long)src->data.uint64); break; case PMIX_FLOAT: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_FLOAT\tValue: %f", prefx, src->data.fval); break; case PMIX_DOUBLE: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_DOUBLE\tValue: %f", prefx, src->data.dval); break; case PMIX_TIMEVAL: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_TIMEVAL\tValue: %ld.%06ld", prefx, (long)src->data.tv.tv_sec, (long)src->data.tv.tv_usec); break; case PMIX_STATUS: rc = asprintf(output, "%sPMIX_VALUE: Data type: PMIX_STATUS\tValue: %s", prefx, PMIx_Error_string(src->data.status)); break; default: rc = asprintf(output, "%sPMIX_VALUE: Data type: UNKNOWN\tValue: UNPRINTABLE", prefx); break; } if (prefx != prefix) { free(prefx); } if (0 > rc) { return PMIX_ERR_NOMEM; } return PMIX_SUCCESS; }
int main(int argc, char **argv) { pmix_status_t rc; pmix_value_t *val; pmix_proc_t proc; pmix_info_t *info; size_t ninfo; pmix_query_t *query; size_t nq, n; myquery_data_t myquery_data; pid_t pid; pmix_status_t code = PMIX_ERR_JOB_TERMINATED; mylock_t mylock; myrel_t myrel; uint16_t localrank; char *target = NULL; pid = getpid(); /* init us - since we were launched by the RM, our connection info * will have been provided at startup. */ if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) { fprintf(stderr, "Debugger daemon: PMIx_tool_init failed: %d\n", rc); exit(0); } fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Running\n", myproc.nspace, myproc.rank, (unsigned long)pid); /* register our default event handler */ DEBUG_CONSTRUCT_LOCK(&mylock); PMIx_Register_event_handler(NULL, 0, NULL, 0, notification_fn, evhandler_reg_callbk, (void*)&mylock); DEBUG_WAIT_THREAD(&mylock); if (PMIX_SUCCESS != mylock.status) { rc = mylock.status; DEBUG_DESTRUCT_LOCK(&mylock); goto done; } DEBUG_DESTRUCT_LOCK(&mylock); /* get the nspace of the job we are to debug - it will be in our JOB info */ #ifdef PMIX_LOAD_PROCID PMIX_LOAD_PROCID(&proc, myproc.nspace, PMIX_RANK_WILDCARD); #else PMIX_PROC_CONSTRUCT(&proc); (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_KEYLEN); proc.rank = PMIX_RANK_WILDCARD; #endif if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) { fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - error %s\n", myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Error_string(rc)); goto done; } if (NULL == val || PMIX_STRING != val->type || NULL == val->data.string) { fprintf(stderr, "[%s:%d:%lu] Failed to get job being debugged - NULL data returned\n", myproc.nspace, myproc.rank, (unsigned long)pid); goto done; } /* save it for later */ target = strdup(val->data.string); PMIX_VALUE_RELEASE(val); fprintf(stderr, "[%s:%d:%lu] Debugging %s\n", myproc.nspace, myproc.rank, (unsigned long)pid, target); /* get my local rank so I can determine which local proc is "mine" * to debug */ val = NULL; if (PMIX_SUCCESS != (rc = PMIx_Get(&myproc, PMIX_LOCAL_RANK, NULL, 0, &val))) { fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - error %s\n", myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Error_string(rc)); goto done; } if (NULL == val) { fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - NULL data returned\n", myproc.nspace, myproc.rank, (unsigned long)pid); goto done; } if (PMIX_UINT16 != val->type) { fprintf(stderr, "[%s:%d:%lu] Failed to get my local rank - returned wrong type %s\n", myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Data_type_string(val->type)); goto done; } /* save the data */ localrank = val->data.uint16; PMIX_VALUE_RELEASE(val); fprintf(stderr, "[%s:%d:%lu] my local rank %d\n", myproc.nspace, myproc.rank, (unsigned long)pid, (int)localrank); /* register another handler specifically for when the target * job completes */ DEBUG_CONSTRUCT_LOCK(&myrel.lock); myrel.nspace = strdup(proc.nspace); PMIX_INFO_CREATE(info, 2); PMIX_INFO_LOAD(&info[0], PMIX_EVENT_RETURN_OBJECT, &myrel, PMIX_POINTER); /* only call me back when this specific job terminates */ PMIX_LOAD_PROCID(&proc, target, PMIX_RANK_WILDCARD); PMIX_INFO_LOAD(&info[1], PMIX_EVENT_AFFECTED_PROC, &proc, PMIX_PROC); fprintf(stderr, "[%s:%d:%lu] registering for termination of %s\n", myproc.nspace, myproc.rank, (unsigned long)pid, proc.nspace); DEBUG_CONSTRUCT_LOCK(&mylock); PMIx_Register_event_handler(&code, 1, info, 2, release_fn, evhandler_reg_callbk, (void*)&mylock); DEBUG_WAIT_THREAD(&mylock); if (PMIX_SUCCESS != mylock.status) { rc = mylock.status; DEBUG_DESTRUCT_LOCK(&mylock); PMIX_INFO_FREE(info, 2); goto done; } DEBUG_DESTRUCT_LOCK(&mylock); PMIX_INFO_FREE(info, 2); /* get our local proctable - for scalability reasons, we don't want to * have our "root" debugger process get the proctable for everybody and * send it out to us. So ask the local PMIx server for the pid's of * our local target processes */ nq = 1; PMIX_QUERY_CREATE(query, nq); PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE); query[0].nqual = 1; PMIX_INFO_CREATE(query[0].qualifiers, 1); PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, target, PMIX_STRING); // the nspace we are enquiring about /* setup the caddy to retrieve the data */ DEBUG_CONSTRUCT_LOCK(&myquery_data.lock); myquery_data.info = NULL; myquery_data.ninfo = 0; /* execute the query */ if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) { fprintf(stderr, "PMIx_Query_info failed: %d\n", rc); goto done; } DEBUG_WAIT_THREAD(&myquery_data.lock); DEBUG_DESTRUCT_LOCK(&myquery_data.lock); PMIX_QUERY_FREE(query, nq); if (PMIX_SUCCESS != myquery_data.status) { rc = myquery_data.status; goto done; } fprintf(stderr, "[%s:%d:%lu] Local proctable received\n", myproc.nspace, myproc.rank, (unsigned long)pid); /* now that we have the proctable for our local processes, we can do our * magic debugger stuff and attach to them. We then send a "release" event * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We * do this with the event notification system. For this example, we just * send it to all local procs of the job being debugged */ (void)strncpy(proc.nspace, target, PMIX_MAX_NSLEN); proc.rank = PMIX_RANK_WILDCARD; ninfo = 2; PMIX_INFO_CREATE(info, ninfo); PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC); // deliver to the target nspace PMIX_INFO_LOAD(&info[1], PMIX_EVENT_NON_DEFAULT, NULL, PMIX_BOOL); // deliver to the target nspace fprintf(stderr, "[%s:%u:%lu] Sending release\n", myproc.nspace, myproc.rank, (unsigned long)pid); rc = PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE, NULL, PMIX_RANGE_CUSTOM, info, ninfo, NULL, NULL); if (PMIX_SUCCESS != rc) { fprintf(stderr, "%s[%s:%u:%lu] Sending release failed with error %s(%d)\n", myproc.nspace, myproc.rank, (unsigned long)pid, PMIx_Error_string(rc), rc); goto done; } /* do some debugger magic while waiting for the job to terminate */ DEBUG_WAIT_THREAD(&myrel.lock); done: if (NULL != target) { free(target); } /* finalize us */ fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu: Finalizing\n", myproc.nspace, myproc.rank, (unsigned long)pid); if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) { fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc); } else { fprintf(stderr, "Debugger daemon ns %s rank %d pid %lu:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank, (unsigned long)pid); } fflush(stderr); return(0); }