Пример #1
0
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "tutorial.Person", slice);
	if (m==NULL) {
		printf("Error : %s",pbc_error(env));
		return;
	}
	printf("name = %s\n", pbc_rmessage_string(m , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(m , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(m , "email" , 0 , NULL));

	int phone_n = pbc_rmessage_size(m, "phone");
	int i;
	const char * field_name;
	pbc_type(env, "tutorial.Person", "phone", &field_name);
	printf("phone type [%s]\n",field_name);

	for (i=0;i<phone_n;i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "phone", i);
		printf("\tnumber[%d] = %s\n",i,pbc_rmessage_string(p , "number", i ,NULL));
		printf("\ttype[%d] = %s\n",i,pbc_rmessage_string(p, "type", i, NULL));
	}

	int n = pbc_rmessage_size(m , "test");

	for (i=0;i<n;i++) {
		printf("test[%d] = %d\n",i, pbc_rmessage_integer(m , "test" , i , NULL));
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(m,"tutorial.Ext.test",0,NULL));
	pbc_rmessage_delete(m);
}
Пример #2
0
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "tutorial.Person", slice);
	printf("name = %s\n", pbc_rmessage_string(m , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(m , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(m , "email" , 0 , NULL));

	int phone_n = pbc_rmessage_size(m, "phone");
	int i;

	for (i=0;i<phone_n;i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "phone", i);
		printf("\tnumber[%d] = %s\n",i,pbc_rmessage_string(p , "number", i ,NULL));
		printf("\ttype[%d] = %s\n",i,pbc_rmessage_string(p, "type", i, NULL));
	}

	int n = pbc_rmessage_size(m , "test");

	for (i=0;i<n;i++) {
		printf("test[%d] = %d\n",i, pbc_rmessage_integer(m , "test" , i , NULL));
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(m,"tutorial.Ext.test",0,NULL));
	pbc_rmessage_delete(m);
}
/**
 * launch a single container
 */
static int query_container_state_internal(hadoop_rpc_proxy_t* proxy, 
    int container_id,
    container_state_t* state,
    int* retval,
    char** diag_msg) {

    char* request = NULL;
    int request_len;
    int rc;

    rc = generate_query_container_state_request(&request, &request_len, proxy, container_id);
    if (0 != rc) {
        opal_output(0, "generate query_container_state_request failed.\n");
        return -1;
    }

    // send request
    rc = send_rpc_request(proxy, request, request_len);
    if (0 != rc) {
        opal_output(0, "send query_container_state_request failed.\n");
        free(request);
        return -1;
    }

    // now we will not use it anymore
    free(request);
    struct pbc_slice slice;

    // read response
    response_type_t response_type;
    response_type = recv_rpc_response(proxy, (char**)(&(slice.buffer)), &(slice.len));
    if (RESPONSE_SUCCEED == response_type) {
        // read response
        struct pbc_rmessage* rmsg = pbc_rmessage_new(env, "GetContainerStatusResponseProto", &slice);
        if (!rmsg) {
            opal_output(0, "deserialize GetContainerStatusResponseProto from buffer failed.\n");
            free(slice.buffer);
            return -1;
        }

        // get container status
        struct pbc_rmessage* status_msg = pbc_rmessage_message(rmsg, "status", 0);
        if (!status_msg) {
            opal_output(0, "get ContainerStatusProto from response failed.\n");
            return -1;
        }

        // read state
        *state = pbc_rmessage_integer(status_msg, "state", 0, NULL);
        *retval = pbc_rmessage_integer(status_msg, "exit_status", 0, NULL);

        pbc_rmessage_delete(rmsg);
        free(slice.buffer);
        return 0;
    } else {
        process_bad_rpc_response(proxy, response_type);
        return -1;
    }
}
Пример #4
0
static void
dump_value(struct pbc_rmessage *m, const char *key, int type, int idx, int level) {
	int i;
	for (i=0;i<level;i++) {
		printf("  ");
	}
	printf("%s",key);
	if (type & PBC_REPEATED) {
		printf("[%d]",idx);
		type -= PBC_REPEATED;
	}
	printf(" : ");

	uint32_t low;
	uint32_t hi;
	double real;
	const char *str;

	switch(type) {
	case PBC_INT:
		low = pbc_rmessage_integer(m, key, i, NULL);
		printf("%d", (int) low);
		break;
	case PBC_REAL:
		real = pbc_rmessage_real(m, key , i);
		printf("%lf", real);
		break;
	case PBC_BOOL:
		low = pbc_rmessage_integer(m, key, i, NULL);
		printf("%s", low ? "true" : "false");
		break;
	case PBC_ENUM:
		str = pbc_rmessage_string(m, key , i , NULL);
		printf("[%s]", str);
		break;
	case PBC_STRING:
		str = pbc_rmessage_string(m, key , i , NULL);
		printf("'%s'", str);
		break;
	case PBC_MESSAGE:
		printf("\n");
		dump_message(pbc_rmessage_message(m, key, i),level+1);
		return;
	case PBC_FIXED64:
		low = pbc_rmessage_integer(m, key, i, &hi);
		printf("0x%8x%8x",hi,low);
		break;
	case PBC_FIXED32:
		low = pbc_rmessage_integer(m, key, i, NULL);
		printf("0x%x",low);
		break;
	default:
		printf("unkown");
		break;
	}

	printf("\n");
}
Пример #5
0
static void
test(struct pbc_env *env) {
	// int i;
	// for(i=0; i<COUNT; i++)
	// {
	// 		struct pbc_wmessage* w_msg = pbc_wmessage_new(env, "at");
	// 		struct pbc_rmessage* r_msg = NULL;
	// 		struct pbc_slice sl;
	// 		char buffer[1024];
	// 		sl.buffer = buffer, sl.len = 1024;
	// 		pbc_wmessage_integer(w_msg, "aa", 123, 0);
	// 		pbc_wmessage_integer(w_msg, "bb", 456, 0);
	// 		pbc_wmessage_string(w_msg, "cc", "test string!", 0);
	// 		pbc_wmessage_buffer(w_msg, &sl);

	// 		r_msg = pbc_rmessage_new(env, "at", &sl);

	// 		//===============
	// 		printf("aa = %d\n", pbc_rmessage_integer(r_msg, "aa", 0, NULL));
	// 		printf("bb = %d\n", pbc_rmessage_integer(r_msg, "bb", 0, NULL));
	// 		printf("cc = %s\n", pbc_rmessage_string(r_msg, "cc", 0, NULL));
	// 		//===============

	// 		pbc_rmessage_delete(r_msg);
	// 		pbc_wmessage_delete(w_msg);
	// }

	struct pbc_wmessage *w_msg = pbc_wmessage_new(env, "at");
	pbc_wmessage_integer(w_msg, "aa", 123, 0);
	pbc_wmessage_integer(w_msg, "bb", 456, 0); 
	pbc_wmessage_string(w_msg, "cc", "test string!", 0);

	struct pbc_slice slice;
	char buffer[1024];
	slice.len = 1024;
	slice.buffer = buffer;
	pbc_wmessage_buffer(w_msg, &slice);
	// pbc_wmessage_delete(w_msg);  //!!!!!!!! should not delete here

	struct pbc_rmessage *r_msg = pbc_rmessage_new(env, "at", &slice);
	printf("aa = %d\n", pbc_rmessage_integer(r_msg, "aa", 0, NULL));
	printf("bb = %d\n", pbc_rmessage_integer(r_msg, "bb", 0, NULL));
	printf("cc = %s\n", pbc_rmessage_string(r_msg, "cc", 0, NULL));

	pbc_rmessage_delete(r_msg);
	pbc_wmessage_delete(w_msg);
}
Пример #6
0
static void
_register_field(struct pbc_rmessage * field, struct _field * f, struct _stringpool *pool) {
	f->id = pbc_rmessage_integer(field, "number", 0 , 0);
	f->type = pbc_rmessage_integer(field, "type", 0 , 0);	// enum
	f->label = pbc_rmessage_integer(field, "label", 0, 0) - 1; // LABEL_OPTIONAL = 0
	if (pbc_rmessage_size(field , "options") > 0) {
		struct pbc_rmessage * options = pbc_rmessage_message(field, "options" , 0);
		int packed = pbc_rmessage_integer(options , "packed" , 0 , NULL);
		if (packed) {
			f->label = LABEL_PACKED;
		}
	}
	f->type_name.n = pbc_rmessage_string(field, "type_name", 0 , NULL) +1;	// abandon prefix '.' 
	int vsz;
	const char * default_value = pbc_rmessage_string(field, "default_value", 0 , &vsz);
	_set_default(pool , f , f->type, default_value , vsz);
}
Пример #7
0
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "real", slice);
	printf("f = %f\n", pbc_rmessage_real(m , "f" , 0 ));
	printf("d = %f\n", pbc_rmessage_real(m , "d" , 0 ));
	printf("e = %ld\n", pbc_rmessage_integer(m , "e" , 0,NULL ));
	pbc_rmessage_delete(m);
}
Пример #8
0
static int
_rmessage_int32(lua_State *L) {
	struct pbc_rmessage * m = (struct pbc_rmessage *)checkuserdata(L,1);
	const char * key = luaL_checkstring(L,2);
	int index = (int)luaL_checkinteger(L,3);
	uint32_t v = pbc_rmessage_integer(m, key, index, NULL);
	lua_pushlightuserdata(L,(void *)(intptr_t)v);

	return 1;
}
Пример #9
0
static int
_rmessage_int32(lua_State *L) {
	struct pbc_rmessage * m = lua_touserdata(L,1);
	const char * key = lua_tostring(L,2);
	int index = lua_tointeger(L,3);
	uint32_t v = pbc_rmessage_integer(m, key, index, NULL);
	lua_pushlightuserdata(L,(void *)v);

	return 1;
}
Пример #10
0
static int
_rmessage_integer(lua_State *L) {
	struct pbc_rmessage * m = (struct pbc_rmessage *)checkuserdata(L,1);
	const char * key = luaL_checkstring(L,2);
	int index = luaL_checkinteger(L,3);
	int32_t v = (int32_t)pbc_rmessage_integer(m, key, index, NULL);

	lua_pushinteger(L,v);

	return 1;
}
Пример #11
0
static int
_rmessage_int(lua_State *L) {
	struct pbc_rmessage * m = (struct pbc_rmessage *)checkuserdata(L,1);
	const char * key = luaL_checkstring(L,2);
	int index = luaL_checkinteger(L,3);
	uint32_t hi,low;
	low = pbc_rmessage_integer(m, key, index, &hi);
	int64_t v = (int64_t)((uint64_t)hi << 32 | (uint64_t)low);
	lua_pushinteger(L,v);

	return 1;
}
Пример #12
0
static int
_rmessage_int64(lua_State *L) {
	struct pbc_rmessage * m = (struct pbc_rmessage *)checkuserdata(L,1);
	const char * key = luaL_checkstring(L,2);
	int index = (int)luaL_checkinteger(L,3);
	uint32_t v[2];
	v[0] = pbc_rmessage_integer(m, key, index, &v[1]);

	lua_pushlstring(L,(const char *)v,sizeof(v));

	return 1;
}
Пример #13
0
static void
_register_enum(struct pbc_env *p, struct _stringpool *pool, struct pbc_rmessage * enum_type, const char *prefix, int prefix_sz) {
	int field_count = pbc_rmessage_size(enum_type, "value");
	struct map_kv *table = malloc(field_count * sizeof(struct map_kv));
	int i;
	for (i=0;i<field_count;i++) {
		struct pbc_rmessage * value = pbc_rmessage_message(enum_type, "value", i);
		int enum_name_sz;
		const char *enum_name = pbc_rmessage_string(value , "name" , 0 , &enum_name_sz);
		table[i].pointer = (void *)_pbcS_build(pool, enum_name , enum_name_sz);
		table[i].id = pbc_rmessage_integer(value , "number", 0 , 0);
	}
	int name_sz;
	const char * name = pbc_rmessage_string(enum_type, "name", 0 , &name_sz);
	const char *temp = _concat_name(pool, prefix , prefix_sz , name , name_sz, NULL);

	_pbcP_push_enum(p,temp,table,field_count);
	free(table);
}
Пример #14
0
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "test", slice);
	if (m==NULL) {
		printf("Error : %s",pbc_error(env));
		return;
	}

	int phone_n = pbc_rmessage_size(m, "el");
	int i;


	for (i=0;i<phone_n;i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "el", i);
		printf("\tint16_min[%d] = %d\n",i,pbc_rmessage_integer(p , "int16_min", i ,NULL));
		printf("\tdouble_max[%d] = %f\n",i,pbc_rmessage_real(p, "double_max", i));
		printf("\tstring[%d] = %s\n",i,pbc_rmessage_string(p, "str", i, NULL));
	}

	pbc_rmessage_delete(m);
}
Пример #15
0
static int orte_ras_yarn_allocate_internal(int np, opal_list_t* nodes) {
    int rc, i;

    // create and send allocate message
    struct pbc_wmessage* msg = pbc_wmessage_new(orte_hdclient_pb_env, "AllocateRequestProto");
    if (!msg) {
        opal_output(0, "%s ras:yarn failed to create AllocateRequestProto", 
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    pbc_wmessage_integer(msg, "resource_count", np, 0);
    rc = orte_hdclient_send_message_and_delete(msg, HAMSTER_MSG_ALLOCATE);
    if (rc != 0) {
        opal_output(0, "%s ras:yarn error happened when send allocate msg to AM",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    // read rmessage out
    struct pbc_rmessage* response = orte_hdclient_recv_message("AllocateResponseProto");
    if (!response) {
        opal_output(0, "%s ras:yarn error happened when recv allocate response from AM",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    int n = pbc_rmessage_size(response, "node_resources");
    if (n <= 0) {
        opal_output(0, "%s ras:yarn got n(=%d) <= 0, please check",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
        return ORTE_ERROR;
    }

    // read node resources
    for (i = 0; i < n; i++) {
        struct pbc_rmessage* node_res = pbc_rmessage_message(response, "node_resources", i);
        if (!node_res) {
            opal_output(0, "%s ras:yarn error when parse returned resource from AM", 
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }
        
        // parse host, slot
        const char* host = pbc_rmessage_string(node_res, "host_name", 0, NULL);
        if (!host) {
            opal_output(0, "%s ras:yarn error when parse host from returned resource from AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }
        int slot = pbc_rmessage_integer(node_res, "slot", 0, NULL);

        // make node_t and add it to nodes
        orte_node_t* node = OBJ_NEW(orte_node_t);
        if (!node) {
            opal_output(0, "%s ras:yarn failed to create orte_node_t obj", 
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            return ORTE_ERROR;
        }

        node->name = strdup(host);
        node->state = ORTE_NODE_STATE_UP;
        node->slots_inuse = 0;
        node->slots_max = 0;
        node->slots = slot;
        opal_list_append(nodes, &node->super);

        OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                     "%s ras:yarn: adding node %s with %d slot",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME),
                     host, slot));
    }

    // All done
    OPAL_OUTPUT_VERBOSE((1, orte_ras_base.ras_output,
                         "%s ras:yarn:allocate: success",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
Пример #16
0
static int common_launch_process(orte_job_t *jdata, bool launch_daemon, int *launched_proc_num)
{
	int i, rc;
	orte_proc_t* proc = NULL;
	char **argv;
	int argc;
	char **env;
	bool error_flag = false;
	int launched_num = 0;

	/* 1. create launch message */
	/*
	 message LaunchRequestProto {
	 repeated LaunchContextProto launch_contexts = 1;
	 }

	 message LaunchContextProto {
	 repeated string envars = 1;
	 optional string args = 2;
	 optional string host_name = 3;
	 optional ProcessNameProto name = 4;
	 }

	 message ProcessNameProto {
	 optional int32 jobid = 1;
	 optional int32 vpid = 2;
	 }
	 */
	struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "LaunchRequestProto");
	if (!request_msg) {
		opal_output(0, "%s plm:yarn:common_process_launch: failed to create AllocateRequestProto",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		return ORTE_ERROR;
	}

	/* when launch_daemon, start from 1 because we don't need launch HNP process */
	i = launch_daemon ? 1 : 0;

	for (; i < jdata->num_procs; i++) {
		argv = NULL;
		argc = 0;
		env = NULL;
		/* setup env/argv  */
		proc = opal_pointer_array_get_item(jdata->procs, i);
		if (!proc) {
			opal_output(0, "%s plm:yarn:common_launch_process: proc[%d] is NULL",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
			ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
		}

		if (launch_daemon) {
			rc = setup_daemon_proc_env_and_argv(proc, &argv, &argc, &env);
		} else {
			orte_app_context_t* app = (orte_app_context_t*) opal_pointer_array_get_item(jdata->apps, proc->app_idx);
			rc = setup_proc_env_and_argv(jdata, app, proc, &argv, &env);
		}
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_launch_process: failed to setup env/argv of proc[%d]",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
			ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
			error_flag = true;
			goto cleanup;
		}

		 /* print launch commandline and env when this env is specified */
		if (getenv("HAMSTER_VERBOSE")) {

			char* join_argv = opal_argv_join(argv, ' ');

			OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:yarn:common_launch_process: launch argv=%s",
							ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), join_argv));
			if (join_argv) {
				free(join_argv);
			}
		}

		OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
									"%s plm:yarn:common_launch_process: after setup env and argv for proc=%d.",
									ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));

		/* now start packing request_msg */
		struct pbc_wmessage *launch_contexts_msg = pbc_wmessage_message(request_msg, "launch_contexts");
		if (!launch_contexts_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: create launch_contexts_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		char **tmp_env = env;
		while (*tmp_env) {
			pbc_wmessage_string(launch_contexts_msg, "envars", *tmp_env, strlen(*tmp_env));
			tmp_env++;
		}

		char* join_argv = opal_argv_join(argv, ' ');
		pbc_wmessage_string(launch_contexts_msg, "args", join_argv, strlen(join_argv));

		pbc_wmessage_string(launch_contexts_msg, "host_name", proc->node->name, strlen(proc->node->name));

		struct pbc_wmessage *proccess_name_msg = pbc_wmessage_message(launch_contexts_msg, "name");
		if (!proccess_name_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: create proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		rc = pbc_wmessage_integer(proccess_name_msg, "jobid", ORTE_LOCAL_JOBID(proc->name.jobid), 0);
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: pack jobid in proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		rc = pbc_wmessage_integer(proccess_name_msg, "vpid", proc->name.vpid,
				0);
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: pack vpid in proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

cleanup:
		/* free argv and env for this proc */
		if (argv) {
			opal_argv_free(argv);
		}
		if (env) {
			opal_argv_free(env);
		}
		if (join_argv) {
			free(join_argv);
		}
		if (error_flag) {
			pbc_wmessage_delete(request_msg);
			ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
			return ORTE_ERROR;
		}
	}

	/* 2. send launch deamon procs request msg */
	rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_LAUNCH);
	if (rc != 0) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: error happened when send launch proc request to AM",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		if (request_msg) {
			pbc_wmessage_delete(request_msg);
		}
		return ORTE_ERROR;
	}

	/* 3. recv response and parse the msg*/
	/*
	 message LaunchResponseProto {
	 repeated LaunchResultProto results = 1;
	 }

	 message LaunchResultProto {
	 optional ProcessNameProto name = 1;
	 optional bool success = 2;
	 }

	 message ProcessNameProto {
	 optional int32 jobid = 1;
	 optional int32 vpid = 2;
	 }
	 */
	struct pbc_rmessage* response_msg = NULL;
	response_msg = orte_hdclient_recv_message("LaunchResponseProto");
	if (!response_msg) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: error happened when recv launch response msg from AM",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		goto launch_failed;
	}

	int n = pbc_rmessage_size(response_msg, "results");
	if (n < 0) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: got n(=%d) < 0, please check",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
		goto launch_failed;
	}

	for (i = 0; i < n; i++) {
		struct pbc_rmessage* results_msg = pbc_rmessage_message(response_msg, "results", i);
		if (!results_msg) {
			opal_output(0,
					"%s plm:yarn:launch_daemons: error when parse returned launch results from AM",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			goto launch_failed;
		}

		struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(results_msg, "name", 0);
		if (!proc_name_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: error when parse returned proc_name_msg from AM",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			goto launch_failed;
		}

		orte_jobid_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL);
		orte_vpid_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL);

		bool success = pbc_rmessage_integer(results_msg, "success", 0, NULL);

		orte_proc_t* proc = (orte_proc_t*) opal_pointer_array_get_item(jdata->procs, vpid);
		if (success) {
			proc->state = ORTE_PROC_STATE_RUNNING;
			launched_num++;
		} else {
			opal_output(0,
					"%s plm:yarn:common_process_launch: launch proc failed when jobid = %u, vpid = %u",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
			proc->state = ORTE_PROC_STATE_FAILED_TO_START;
			jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
			goto launch_failed;
		}
	}

	/* to return back */
	*launched_proc_num = launched_num;
	return ORTE_SUCCESS;

launch_failed:
	    if (response_msg) {
	        pbc_rmessage_delete(response_msg);
	    }
	    ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
	    return ORTE_ERROR;
}
Пример #17
0
static void heartbeat_with_AM_cb(int fd, short event, void *data)
{
    int i, rc;
    orte_job_t *jdata = (orte_job_t*)data;
    orte_job_t* daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

    /* 1. create heartbeat request msg */
    /*
    message HeartbeatRequestProto {
    }
    */
    struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "HeartbeatRequestProto");
    if (!request_msg) {
        opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: failed to create request_msg",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* 2. send heartbeat request msg */
    rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_HEARTBEAT);
    if (rc != 0) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: error happened when send request_msg to AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* 3. recv response and parse the msg*/
    /*
     message HeartbeatResponseProto {
         repeated ProcessStatusProto completed_processes = 1;
     }

     message ProcessStatusProto {
         optional ProcessNameProto name = 1;
         optional ProcessStateProto state = 2;
         optional int32 exit_value = 3;
     }

     enum ProcessStateProto {
         RUNNING = 1;
         COMPLETED = 2;
     }

     message ProcessNameProto {
         optional int32 jobid = 1;
         optional int32 vpid = 2;
     }
     */

    struct pbc_rmessage* response_msg = orte_hdclient_recv_message("HeartbeatResponseProto");
    if (!response_msg) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: error happened when recv HeartbeatResponseProto msg from AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    int n = pbc_rmessage_size(response_msg, "completed_processes");
    if (n < 0) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: got n(=%d) < 0, please check",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
        goto cleanup;
    }

    for (i = 0; i < n; i++) {
        struct pbc_rmessage* completed_procs_msg = pbc_rmessage_message(response_msg, "completed_processes", i);
        if (!completed_procs_msg) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: error when parse returned completed_procs_msg from AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(completed_procs_msg, "name", 0);
        if (!proc_name_msg) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: error when parse proc_name_msg",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        uint32_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL);
        uint32_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL);

        uint32_t exit_value = pbc_rmessage_integer(completed_procs_msg, "exit_value", 0, NULL);

        /* next, we will modify proc's state */
        orte_job_t* tmp_jdata = (orte_job_t*) opal_pointer_array_get_item(orte_job_data, local_jobid);
        orte_proc_t* proc = (orte_proc_t*) opal_pointer_array_get_item(tmp_jdata->procs, vpid);


        if (tmp_jdata->jobid == jdata->jobid) {
			num_completed_jdata_procs++;
		}

        if (exit_value == 0) {
        	proc->state = ORTE_PROC_STATE_TERMINATED;
        }

        /* if this process is already terminated, just skip over */
        if (proc->state >= ORTE_PROC_STATE_TERMINATED) {
            continue;
        }

        if (exit_value == -1000 || exit_value == -100 || exit_value == -101) {
            opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb proc failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_ERROR_LOG(ORTE_ERROR);
            proc->state = ORTE_PROC_STATE_FAILED_TO_START;
            ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
        } else {
            /* here, means currently the proc's state < ORTE_PROC_STATE_TERMINATED,
             * however, from AM's heartbeat response, we got the proc's container is terminated,
             * to solve this dilemma , we set a timer event to reconfirm this proc's state,
             */
            opal_event_t *ev = NULL;
            ev = (opal_event_t*) malloc(sizeof(opal_event_t));

            struct timeval delay;
            delay.tv_sec = 15;
            delay.tv_usec = 0;

            opal_event_evtimer_set(orte_event_base, ev, process_state_monitor_cb, proc);
            opal_event_evtimer_add(ev, &delay);
        }
    }

cleanup:
    if (response_msg) {
        pbc_rmessage_delete(response_msg);
    }

    if (num_completed_jdata_procs == jdata->num_procs) {
        /*
         * all procs are completed, send finish request to AM,
         * modify job state to ORTE_JOB_STATE_TERMINATED
         */
        jdata->state = ORTE_JOB_STATE_TERMINATED;
        finish_app_master(0 == orte_exit_status);
        return;
    } else {
        /* next heartbeat */
        opal_event_t *ev = NULL;
        ev = (opal_event_t*) malloc(sizeof(opal_event_t));

        struct timeval delay;
        delay.tv_sec = 1;
        delay.tv_usec = 0;

        opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata);
		opal_event_evtimer_add(ev, &delay);
    }
}