/*
 * Decode `slice` as a "tutorial.Person" message and print its contents:
 * name, id, email, every "phone" entry (number and type), every "test"
 * value and the "tutorial.Ext.test" extension field.
 *
 * env   - pbc environment used to decode the message.
 * slice - encoded message bytes.
 *
 * Prints an error line and returns early when the message cannot be opened.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "tutorial.Person", slice);
	if (m == NULL) {
		/* Every accessor below dereferences the message, so bail out here. */
		printf("Error : failed to decode tutorial.Person\n");
		return;
	}

	printf("name = %s\n", pbc_rmessage_string(m , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(m , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(m , "email" , 0 , NULL));

	int phone_n = pbc_rmessage_size(m, "phone");
	int i;

	for (i = 0; i < phone_n; i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "phone", i);
		/* NOTE(review): the index `i` is passed to fields of the nested
		 * message exactly as the original did; confirm it is ignored for
		 * non-repeated fields. */
		printf("\tnumber[%d] = %s\n",i,pbc_rmessage_string(p , "number", i ,NULL));
		printf("\ttype[%d] = %s\n",i,pbc_rmessage_string(p, "type", i, NULL));
	}

	int n = pbc_rmessage_size(m , "test");

	for (i = 0; i < n; i++) {
		printf("test[%d] = %d\n",i, pbc_rmessage_integer(m , "test" , i , NULL));
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(m,"tutorial.Ext.test",0,NULL));
	pbc_rmessage_delete(m);
}
/*
 * Register the contents of one file descriptor message.
 *
 * Reads the "package" string, then in order: registers every "enum_type"
 * entry, registers every "message_type" entry, registers the file-level
 * extensions, and finally hands the accumulated queue to
 * _pbcB_register_fields before closing it.
 */
static void
_register(struct pbc_env *p, struct pbc_rmessage * file, struct _stringpool *pool) {
	int pkg_len;
	const char *pkg = pbc_rmessage_string(file, "package", 0, &pkg_len);

	pbc_array pending;
	_pbcA_open(pending);

	/* Top-level enums. */
	int total = pbc_rmessage_size(file, "enum_type");
	int idx = 0;
	while (idx < total) {
		struct pbc_rmessage * e = pbc_rmessage_message(file, "enum_type", idx);
		_register_enum(p, pool , e, pkg, pkg_len);
		++idx;
	}

	/* Top-level messages; their fields are pushed onto `pending`. */
	total = pbc_rmessage_size(file, "message_type");
	idx = 0;
	while (idx < total) {
		struct pbc_rmessage * msg = pbc_rmessage_message(file, "message_type", idx);
		_register_message(p, pool, msg, pkg, pkg_len, pending);
		++idx;
	}

	_register_extension(p, pool, pkg, pkg_len, file , pending);

	_pbcB_register_fields(p, pending);
	_pbcA_close(pending);
}
/*
 * Decode `slice` as "tutorial.Person" and print it.
 *
 * On decode failure the pbc error string is printed and nothing else
 * happens. Otherwise prints name, id and email, the type name reported by
 * pbc_type for the "phone" field, each phone entry, each "test" value and
 * the "tutorial.Ext.test" extension, then releases the message.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * person = pbc_rmessage_new(env, "tutorial.Person", slice);
	if (person == NULL) {
		printf("Error : %s",pbc_error(env));
		return;
	}

	printf("name = %s\n", pbc_rmessage_string(person , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(person , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(person , "email" , 0 , NULL));

	int phone_total = pbc_rmessage_size(person, "phone");

	const char * phone_type_name;
	pbc_type(env, "tutorial.Person", "phone", &phone_type_name);
	printf("phone type [%s]\n",phone_type_name);

	int k = 0;
	while (k < phone_total) {
		struct pbc_rmessage * phone = pbc_rmessage_message(person , "phone", k);
		printf("\tnumber[%d] = %s\n",k,pbc_rmessage_string(phone , "number", k ,NULL));
		printf("\ttype[%d] = %s\n",k,pbc_rmessage_string(phone, "type", k, NULL));
		++k;
	}

	int test_total = pbc_rmessage_size(person , "test");
	k = 0;
	while (k < test_total) {
		printf("test[%d] = %d\n",k, pbc_rmessage_integer(person , "test" , k , NULL));
		++k;
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(person,"tutorial.Ext.test",0,NULL));

	pbc_rmessage_delete(person);
}
/*
 * Register every "extension" field declared inside `msg`.
 *
 * Each extension field is named `prefix` + its own name (via _concat_name)
 * and pushed onto the message named by its "extendee" string with the first
 * character skipped (presumably the leading '.', TODO confirm).
 * _pbcP_init_message is called for an extendee whenever the following
 * extension names a different extendee, and once more for the last one.
 *
 * Does nothing when `msg` has no extensions.
 */
static void
_register_extension(struct pbc_env *p, struct _stringpool *pool , const char * prefix, int prefix_sz, struct pbc_rmessage * msg, pbc_array queue) {
	int total = pbc_rmessage_size(msg , "extension");
	if (total <= 0)
		return;

	/* Extendee whose init call is still outstanding. */
	const char * pending = NULL;
	int idx = 0;

	while (idx < total) {
		struct pbc_rmessage * ext = pbc_rmessage_message(msg, "extension", idx);
		int short_sz = 0;
		struct _field fld;
		const char * short_name = pbc_rmessage_string(ext , "name" , 0, &short_sz);

		fld.name = _concat_name(pool, prefix, prefix_sz, short_name, short_sz, NULL);
		_register_field(ext, &fld , pool);

		const char * target = pbc_rmessage_string(ext , "extendee" , 0, NULL);
		_pbcP_push_message(p, target + 1 , &fld , queue);

		if (pending == NULL) {
			pending = target;
		} else {
			int switched = (strcmp(target, pending) != 0);
			if (switched) {
				/* Extendee changed: finish the previous one first. */
				_pbcP_init_message(p, pending + 1);
				pending = target;
			}
		}

		++idx;
	}

	_pbcP_init_message(p, pending + 1);
}
/*
 * Lua binding: argument 1 is an rmessage userdata, argument 2 a field name.
 * Pushes pbc_rmessage_size() for that field and returns one Lua result.
 */
static int
_rmessage_size(lua_State *L) {
	struct pbc_rmessage * msg = (struct pbc_rmessage *)checkuserdata(L,1);
	const char * field = luaL_checkstring(L,2);

	lua_pushinteger(L, pbc_rmessage_size(msg, field));
	return 1;
}
/*
 * Register one message descriptor and everything nested inside it.
 *
 * The message's full name is `prefix` combined with its "name" string by
 * _concat_name. Steps, in order:
 *   1. push every "field" entry onto the message (queued in `queue`);
 *   2. call _pbcP_init_message for the message;
 *   3. register the extensions declared inside the message;
 *   4. register nested enums ("enum_type") with the full name as prefix;
 *   5. recurse into nested messages ("nested_type") with the same prefix.
 */
static void
_register_message(struct pbc_env *p, struct _stringpool *pool, struct pbc_rmessage * message_type, const char *prefix, int prefix_sz, pbc_array queue) {
	int short_len;
	const char * short_name = pbc_rmessage_string(message_type, "name", 0 , &short_len);
	int full_len = 0;
	const char * full_name = _concat_name(pool, prefix , prefix_sz , short_name , short_len, &full_len);

	int idx;
	int total = pbc_rmessage_size(message_type, "field");
	for (idx = 0; idx < total; ++idx) {
		struct pbc_rmessage * desc = pbc_rmessage_message(message_type, "field" , idx);
		struct _field fld;
		int fname_len;
		const char * fname = pbc_rmessage_string(desc, "name", 0 , &fname_len);

		fld.name = _pbcS_build(pool,fname,fname_len);
		_register_field(desc, &fld , pool);
		_pbcP_push_message(p, full_name , &fld , queue);
	}

	_pbcP_init_message(p, full_name);

	_register_extension(p, pool, full_name, full_len,message_type, queue);

	/* Enums declared inside this message. */
	total = pbc_rmessage_size(message_type, "enum_type");
	for (idx = 0; idx < total; ++idx) {
		struct pbc_rmessage * inner_enum = pbc_rmessage_message(message_type, "enum_type", idx);
		_register_enum(p, pool, inner_enum, full_name, full_len);
	}

	/* Messages declared inside this message. */
	total = pbc_rmessage_size(message_type, "nested_type");
	for (idx = 0; idx < total; ++idx) {
		struct pbc_rmessage * inner_msg = pbc_rmessage_message(message_type, "nested_type", idx);
		_register_message(p, pool, inner_msg, full_name, full_len, queue);
	}
}
static void _register_field(struct pbc_rmessage * field, struct _field * f, struct _stringpool *pool) { f->id = pbc_rmessage_integer(field, "number", 0 , 0); f->type = pbc_rmessage_integer(field, "type", 0 , 0); // enum f->label = pbc_rmessage_integer(field, "label", 0, 0) - 1; // LABEL_OPTIONAL = 0 if (pbc_rmessage_size(field , "options") > 0) { struct pbc_rmessage * options = pbc_rmessage_message(field, "options" , 0); int packed = pbc_rmessage_integer(options , "packed" , 0 , NULL); if (packed) { f->label = LABEL_PACKED; } } f->type_name.n = pbc_rmessage_string(field, "type_name", 0 , NULL) +1; // abandon prefix '.' int vsz; const char * default_value = pbc_rmessage_string(field, "default_value", 0 , &vsz); _set_default(pool , f , f->type, default_value , vsz); }
static void _register_enum(struct pbc_env *p, struct _stringpool *pool, struct pbc_rmessage * enum_type, const char *prefix, int prefix_sz) { int field_count = pbc_rmessage_size(enum_type, "value"); struct map_kv *table = malloc(field_count * sizeof(struct map_kv)); int i; for (i=0;i<field_count;i++) { struct pbc_rmessage * value = pbc_rmessage_message(enum_type, "value", i); int enum_name_sz; const char *enum_name = pbc_rmessage_string(value , "name" , 0 , &enum_name_sz); table[i].pointer = (void *)_pbcS_build(pool, enum_name , enum_name_sz); table[i].id = pbc_rmessage_integer(value , "number", 0 , 0); } int name_sz; const char * name = pbc_rmessage_string(enum_type, "name", 0 , &name_sz); const char *temp = _concat_name(pool, prefix , prefix_sz , name , name_sz, NULL); _pbcP_push_enum(p,temp,table,field_count); free(table); }
int pbc_register(struct pbc_env * p, struct pbc_slice *slice) { //dump((uint8_t *)slice->buffer, slice->len); struct pbc_rmessage * message = pbc_rmessage_new(p, "google.protobuf.FileDescriptorSet", slice); if (message == NULL) { p->lasterror = "register open google.protobuf.FileDescriptorSet fail"; return 1; } int n = pbc_rmessage_size(message, "file"); struct pbc_rmessage ** files = (struct pbc_rmessage **)_pbcM_malloc(n*(sizeof(struct pbc_rmessage *))); int i; int r = n; if (n == 0) { p->lasterror = "register empty"; goto _error; } for (i=0;i<n;i++) { files[i] = pbc_rmessage_message(message, "file", i); if (files[i] == NULL) { p->lasterror = "register open fail"; goto _error; } } do { int rr = _register_no_dependency(p,files , n); if (rr == r) { p->lasterror = "register dependency error"; goto _error; } r = rr; } while (r>0); pbc_rmessage_delete(message); free(files); return 0; _error: pbc_rmessage_delete(message); free(files); return 1; }
/*
 * Walk every field of `m` with pbc_rmessage_next and dump it at the given
 * nesting `level`. A field flagged PBC_REPEATED is dumped once per element;
 * any other field is dumped once with index 0. Iteration ends when
 * pbc_rmessage_next leaves the key NULL.
 */
static void
dump_message(struct pbc_rmessage *m, int level) {
	const char *field = NULL;
	int type = pbc_rmessage_next(m, &field);

	while (field != NULL) {
		int count = (type & PBC_REPEATED) ? pbc_rmessage_size(m, field) : 1;
		int idx;

		for (idx = 0; idx < count; ++idx) {
			dump_value(m, field , type , idx , level);
		}

		type = pbc_rmessage_next(m, &field);
	}
}
static int _check_file_name(struct pbc_env * p , struct pbc_rmessage * file, const char ** fname) { const char * filename = pbc_rmessage_string(file, "name", 0, NULL); // printf("reg :%s\n",filename); if (_pbcM_sp_query(p->files, filename)) { return CHECK_FILE_EXIST; } int sz = pbc_rmessage_size(file, "dependency"); int i; for (i=0;i<sz;i++) { const char *dname = pbc_rmessage_string(file,"dependency",i,NULL); // printf("dependency :%s\n",dname); if (_pbcM_sp_query(p->files, dname) == NULL) { return CHECK_FILE_DEPENDENCY; } } *fname = filename; return CHECK_FILE_OK; }
/*
 * Decode `slice` as a "test" message and print, for every "el" entry, its
 * "int16_min", "double_max" and "str" fields. Prints the pbc error string
 * and returns when the message cannot be opened.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * root = pbc_rmessage_new(env, "test", slice);
	if (root == NULL) {
		printf("Error : %s",pbc_error(env));
		return;
	}

	int total = pbc_rmessage_size(root, "el");
	int k = 0;

	while (k < total) {
		struct pbc_rmessage * el = pbc_rmessage_message(root , "el", k);

		printf("\tint16_min[%d] = %d\n",k,pbc_rmessage_integer(el , "int16_min", k ,NULL));
		printf("\tdouble_max[%d] = %f\n",k,pbc_rmessage_real(el, "double_max", k));
		printf("\tstring[%d] = %s\n",k,pbc_rmessage_string(el, "str", k, NULL));
		++k;
	}

	pbc_rmessage_delete(root);
}
int pbc_register(struct pbc_env * p, struct pbc_slice *slice) { struct pbc_rmessage * message = pbc_rmessage_new(p, "google.protobuf.FileDescriptorSet", slice); if (message == NULL) { p->lasterror = "register open google.protobuf.FileDescriptorSet fail"; return 1; } int n = pbc_rmessage_size(message, "file"); struct pbc_rmessage * files[n]; int i; if (n == 0) { p->lasterror = "register empty"; goto _error; } for (i=0;i<n;i++) { files[i] = pbc_rmessage_message(message, "file", i); if (files[i] == NULL) { p->lasterror = "register open fail"; goto _error; } } int r = n; do { int rr = _register_no_dependency(p,files , n); if (rr == r) { p->lasterror = "register dependency error"; goto _error; } r = rr; } while (r>0); pbc_rmessage_delete(message); return 0; _error: pbc_rmessage_delete(message); return 1; }
/*
 * Ask the AM for `np` resources and append one orte_node_t per returned
 * node resource to `nodes`.
 *
 * Sends an AllocateRequestProto with "resource_count" = np, receives an
 * AllocateResponseProto and, for each "node_resources" entry, creates a node
 * named after "host_name" with "slot" slots.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERROR on any failure. Nodes appended before
 * a failure are left in `nodes`. The received response message is released
 * on every path once it has been obtained.
 */
static int orte_ras_yarn_allocate_internal(int np, opal_list_t* nodes)
{
    int rc, i;
    int ret = ORTE_ERROR;
    struct pbc_rmessage* response = NULL;

    /* create and send allocate message */
    struct pbc_wmessage* msg = pbc_wmessage_new(orte_hdclient_pb_env, "AllocateRequestProto");
    if (!msg) {
        opal_output(0, "%s ras:yarn failed to create AllocateRequestProto",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    pbc_wmessage_integer(msg, "resource_count", np, 0);

    rc = orte_hdclient_send_message_and_delete(msg, HAMSTER_MSG_ALLOCATE);
    if (rc != 0) {
        opal_output(0, "%s ras:yarn error happened when send allocate msg to AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    /* read rmessage out */
    response = orte_hdclient_recv_message("AllocateResponseProto");
    if (!response) {
        opal_output(0, "%s ras:yarn error happened when recv allocate response from AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    int n = pbc_rmessage_size(response, "node_resources");
    if (n <= 0) {
        opal_output(0, "%s ras:yarn got n(=%d) <= 0, please check",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
        goto cleanup;
    }

    /* read node resources */
    for (i = 0; i < n; i++) {
        struct pbc_rmessage* node_res = pbc_rmessage_message(response, "node_resources", i);
        if (!node_res) {
            opal_output(0, "%s ras:yarn error when parse returned resource from AM",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        /* parse host, slot */
        const char* host = pbc_rmessage_string(node_res, "host_name", 0, NULL);
        if (!host) {
            opal_output(0, "%s ras:yarn error when parse host from returned resource from AM",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }
        int slot = pbc_rmessage_integer(node_res, "slot", 0, NULL);

        /* make node_t and add it to nodes */
        orte_node_t* node = OBJ_NEW(orte_node_t);
        if (!node) {
            opal_output(0, "%s ras:yarn failed to create orte_node_t obj",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        /* The name is copied, so the node does not point into `response`. */
        node->name = strdup(host);
        node->state = ORTE_NODE_STATE_UP;
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = slot;
        opal_list_append(nodes, &node->super);
        OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                             "%s ras:yarn: adding node %s with %d slot",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), host, slot));
    }

    /* All done */
    OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                         "%s ras:yarn:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));
    ret = ORTE_SUCCESS;

cleanup:
    /* The received message belongs to this function; release it here. */
    pbc_rmessage_delete(response);
    return ret;
}
/*
 * Launch the processes of `jdata` through the AM.
 *
 * jdata             - job whose procs are launched.
 * launch_daemon     - when true, proc 0 (the HNP) is skipped and the daemon
 *                     env/argv setup is used; otherwise the application
 *                     env/argv setup is used.
 * launched_proc_num - receives the number of procs reported as launched;
 *                     written only on success.
 *
 * Builds one LaunchRequestProto with a launch context per proc, sends it,
 * then reads the LaunchResponseProto and updates each reported proc's state.
 *
 * Returns ORTE_SUCCESS, or ORTE_ERROR on any failure (the exit status is
 * updated for failures while packing and while handling the response).
 */
static int common_launch_process(orte_job_t *jdata, bool launch_daemon, int *launched_proc_num)
{
    int i, rc;
    orte_proc_t* proc = NULL;
    char **argv;
    int argc;
    char **env;
    bool error_flag = false;
    int launched_num = 0;
    struct pbc_rmessage* response_msg = NULL;

    /* 1. create launch message */
    /*
     message LaunchRequestProto {
         repeated LaunchContextProto launch_contexts = 1;
     }

     message LaunchContextProto {
         repeated string envars = 1;
         optional string args = 2;
         optional string host_name = 3;
         optional ProcessNameProto name = 4;
     }

     message ProcessNameProto {
         optional int32 jobid = 1;
         optional int32 vpid = 2;
     }
     */
    struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "LaunchRequestProto");
    if (!request_msg) {
        opal_output(0, "%s plm:yarn:common_process_launch: failed to create AllocateRequestProto",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    /* when launch_daemon, start from 1 because we don't need launch HNP process */
    i = launch_daemon ? 1 : 0;

    for (; i < jdata->num_procs; i++) {
        /* Initialised up front so that every `goto cleanup` below sees a
         * well-defined value. */
        char *join_argv = NULL;

        argv = NULL;
        argc = 0;
        env = NULL;

        /* setup env/argv */
        proc = opal_pointer_array_get_item(jdata->procs, i);
        if (!proc) {
            opal_output(0, "%s plm:yarn:common_launch_process: proc[%d] is NULL",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
            ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
            /* Everything below dereferences proc, so stop here. */
            error_flag = true;
            goto cleanup;
        }

        if (launch_daemon) {
            rc = setup_daemon_proc_env_and_argv(proc, &argv, &argc, &env);
        } else {
            orte_app_context_t* app = (orte_app_context_t*) opal_pointer_array_get_item(jdata->apps, proc->app_idx);
            rc = setup_proc_env_and_argv(jdata, app, proc, &argv, &env);
        }

        if (0 != rc) {
            opal_output(0, "%s plm:yarn:common_launch_process: failed to setup env/argv of proc[%d]",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
            ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
            error_flag = true;
            goto cleanup;
        }

        /* print launch commandline and env when this env is specified */
        if (getenv("HAMSTER_VERBOSE")) {
            char* verbose_argv = opal_argv_join(argv, ' ');
            OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                                 "%s plm:yarn:common_launch_process: launch argv=%s",
                                 ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), verbose_argv));
            free(verbose_argv);
        }

        OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
                             "%s plm:yarn:common_launch_process: after setup env and argv for proc=%d.",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));

        /* now start packing request_msg */
        struct pbc_wmessage *launch_contexts_msg = pbc_wmessage_message(request_msg, "launch_contexts");
        if (!launch_contexts_msg) {
            opal_output(0, "%s plm:yarn:common_process_launch: create launch_contexts_msg failed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            error_flag = true;
            goto cleanup;
        }

        if (env) {
            char **tmp_env;
            for (tmp_env = env; *tmp_env; tmp_env++) {
                pbc_wmessage_string(launch_contexts_msg, "envars", *tmp_env, strlen(*tmp_env));
            }
        }

        join_argv = opal_argv_join(argv, ' ');
        if (!join_argv) {
            opal_output(0, "%s plm:yarn:common_process_launch: failed to join argv of proc[%d]",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
            error_flag = true;
            goto cleanup;
        }
        pbc_wmessage_string(launch_contexts_msg, "args", join_argv, strlen(join_argv));

        pbc_wmessage_string(launch_contexts_msg, "host_name", proc->node->name, strlen(proc->node->name));

        struct pbc_wmessage *proccess_name_msg = pbc_wmessage_message(launch_contexts_msg, "name");
        if (!proccess_name_msg) {
            opal_output(0, "%s plm:yarn:common_process_launch: create proccess_name_msg failed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            error_flag = true;
            goto cleanup;
        }

        rc = pbc_wmessage_integer(proccess_name_msg, "jobid", ORTE_LOCAL_JOBID(proc->name.jobid), 0);
        if (0 != rc) {
            opal_output(0, "%s plm:yarn:common_process_launch: pack jobid in proccess_name_msg failed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            error_flag = true;
            goto cleanup;
        }

        rc = pbc_wmessage_integer(proccess_name_msg, "vpid", proc->name.vpid, 0);
        if (0 != rc) {
            opal_output(0, "%s plm:yarn:common_process_launch: pack vpid in proccess_name_msg failed",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            error_flag = true;
            goto cleanup;
        }

cleanup:
        /* free argv and env for this proc */
        if (argv) {
            opal_argv_free(argv);
        }
        if (env) {
            opal_argv_free(env);
        }
        free(join_argv);

        if (error_flag) {
            pbc_wmessage_delete(request_msg);
            ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return ORTE_ERROR;
        }
    }

    /* 2. send launch deamon procs request msg */
    rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_LAUNCH);
    if (rc != 0) {
        opal_output(0,
                    "%s plm:yarn:common_process_launch: error happened when send launch proc request to AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        /* NOTE(review): the send helper's name suggests it already deletes
         * the message; if it also does so on failure this is a double
         * delete. Kept as in the original until that is confirmed. */
        if (request_msg) {
            pbc_wmessage_delete(request_msg);
        }
        return ORTE_ERROR;
    }

    /* 3. recv response and parse the msg*/
    /*
     message LaunchResponseProto {
         repeated LaunchResultProto results = 1;
     }

     message LaunchResultProto {
         optional ProcessNameProto name = 1;
         optional bool success = 2;
     }

     message ProcessNameProto {
         optional int32 jobid = 1;
         optional int32 vpid = 2;
     }
     */
    response_msg = orte_hdclient_recv_message("LaunchResponseProto");
    if (!response_msg) {
        opal_output(0,
                    "%s plm:yarn:common_process_launch: error happened when recv launch response msg from AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto launch_failed;
    }

    int n = pbc_rmessage_size(response_msg, "results");
    if (n < 0) {
        opal_output(0, "%s plm:yarn:common_process_launch: got n(=%d) < 0, please check",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
        goto launch_failed;
    }

    for (i = 0; i < n; i++) {
        struct pbc_rmessage* results_msg = pbc_rmessage_message(response_msg, "results", i);
        if (!results_msg) {
            opal_output(0,
                        "%s plm:yarn:launch_daemons: error when parse returned launch results from AM",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto launch_failed;
        }

        struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(results_msg, "name", 0);
        if (!proc_name_msg) {
            opal_output(0,
                        "%s plm:yarn:common_process_launch: error when parse returned proc_name_msg from AM",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto launch_failed;
        }

        orte_jobid_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL);
        orte_vpid_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL);
        bool success = pbc_rmessage_integer(results_msg, "success", 0, NULL);

        /* The vpid comes from the AM; do not trust it to name a known proc. */
        orte_proc_t* result_proc = (orte_proc_t*) opal_pointer_array_get_item(jdata->procs, vpid);
        if (!result_proc) {
            opal_output(0,
                        "%s plm:yarn:common_process_launch: launch result for unknown proc, jobid = %u, vpid = %u",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
            goto launch_failed;
        }

        if (success) {
            result_proc->state = ORTE_PROC_STATE_RUNNING;
            launched_num++;
        } else {
            opal_output(0,
                        "%s plm:yarn:common_process_launch: launch proc failed when jobid = %u, vpid = %u",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
            result_proc->state = ORTE_PROC_STATE_FAILED_TO_START;
            jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
            goto launch_failed;
        }
    }

    /* The response is no longer needed; release it on success as well. */
    pbc_rmessage_delete(response_msg);

    /* to return back */
    *launched_proc_num = launched_num;
    return ORTE_SUCCESS;

launch_failed:
    if (response_msg) {
        pbc_rmessage_delete(response_msg);
    }
    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
    return ORTE_ERROR;
}
static void heartbeat_with_AM_cb(int fd, short event, void *data) { int i, rc; orte_job_t *jdata = (orte_job_t*)data; orte_job_t* daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid); /* 1. create heartbeat request msg */ /* message HeartbeatRequestProto { } */ struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "HeartbeatRequestProto"); if (!request_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: failed to create request_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } /* 2. send heartbeat request msg */ rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_HEARTBEAT); if (rc != 0) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error happened when send request_msg to AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE); return; } /* 3. recv response and parse the msg*/ /* message HeartbeatResponseProto { repeated ProcessStatusProto completed_processes = 1; } message ProcessStatusProto { optional ProcessNameProto name = 1; optional ProcessStateProto state = 2; optional int32 exit_value = 3; } enum ProcessStateProto { RUNNING = 1; COMPLETED = 2; } message ProcessNameProto { optional int32 jobid = 1; optional int32 vpid = 2; } */ struct pbc_rmessage* response_msg = orte_hdclient_recv_message("HeartbeatResponseProto"); if (!response_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error happened when recv HeartbeatResponseProto msg from AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } int n = pbc_rmessage_size(response_msg, "completed_processes"); if (n < 0) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: got n(=%d) < 0, please check", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n); goto cleanup; } for (i = 0; i < n; i++) { struct pbc_rmessage* completed_procs_msg = pbc_rmessage_message(response_msg, "completed_processes", i); if (!completed_procs_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error when 
parse returned completed_procs_msg from AM", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(completed_procs_msg, "name", 0); if (!proc_name_msg) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: error when parse proc_name_msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); goto cleanup; } uint32_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL); uint32_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL); uint32_t exit_value = pbc_rmessage_integer(completed_procs_msg, "exit_value", 0, NULL); /* next, we will modify proc's state */ orte_job_t* tmp_jdata = (orte_job_t*) opal_pointer_array_get_item(orte_job_data, local_jobid); orte_proc_t* proc = (orte_proc_t*) opal_pointer_array_get_item(tmp_jdata->procs, vpid); if (tmp_jdata->jobid == jdata->jobid) { num_completed_jdata_procs++; } if (exit_value == 0) { proc->state = ORTE_PROC_STATE_TERMINATED; } /* if this process is already terminated, just skip over */ if (proc->state >= ORTE_PROC_STATE_TERMINATED) { continue; } if (exit_value == -1000 || exit_value == -100 || exit_value == -101) { opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb proc failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)); ORTE_ERROR_LOG(ORTE_ERROR); proc->state = ORTE_PROC_STATE_FAILED_TO_START; ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH); } else { /* here, means currently the proc's state < ORTE_PROC_STATE_TERMINATED, * however, from AM's heartbeat response, we got the proc's container is terminated, * to solve this dilemma , we set a timer event to reconfirm this proc's state, */ opal_event_t *ev = NULL; ev = (opal_event_t*) malloc(sizeof(opal_event_t)); struct timeval delay; delay.tv_sec = 15; delay.tv_usec = 0; opal_event_evtimer_set(orte_event_base, ev, process_state_monitor_cb, proc); opal_event_evtimer_add(ev, &delay); } } cleanup: if (response_msg) { pbc_rmessage_delete(response_msg); } if (num_completed_jdata_procs == 
jdata->num_procs) { /* * all procs are completed, send finish request to AM, * modify job state to ORTE_JOB_STATE_TERMINATED */ jdata->state = ORTE_JOB_STATE_TERMINATED; finish_app_master(0 == orte_exit_status); return; } else { /* next heartbeat */ opal_event_t *ev = NULL; ev = (opal_event_t*) malloc(sizeof(opal_event_t)); struct timeval delay; delay.tv_sec = 1; delay.tv_usec = 0; opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata); opal_event_evtimer_add(ev, &delay); } }