Пример #1
0
/*
 * Lua C function: fetches the pbc read-message stored as userdata in
 * argument 1 and deletes it. Always returns 0.
 */
static int
_rmessage_delete(lua_State *L) {
	pbc_rmessage_delete((struct pbc_rmessage *)checkuserdata(L, 1));
	return 0;
}
Пример #2
0
/*
 * Decode `slice` as a "tutorial.Person" message and print its "name", "id"
 * and "email" fields, every "phone" entry, every "test" value and the
 * "tutorial.Ext.test" field to stdout.
 *
 * If the message cannot be decoded an error line is printed and nothing
 * else is done.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "tutorial.Person", slice);
	if (m == NULL) {
		/* never hand a NULL message to the accessors below */
		printf("Error : failed to decode tutorial.Person\n");
		return;
	}
	printf("name = %s\n", pbc_rmessage_string(m , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(m , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(m , "email" , 0 , NULL));

	int phone_n = pbc_rmessage_size(m, "phone");
	int i;

	for (i=0;i<phone_n;i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "phone", i);
		if (p == NULL) {
			/* skip an entry that could not be opened instead of dereferencing it */
			continue;
		}
		printf("\tnumber[%d] = %s\n",i,pbc_rmessage_string(p , "number", i ,NULL));
		printf("\ttype[%d] = %s\n",i,pbc_rmessage_string(p, "type", i, NULL));
	}

	int n = pbc_rmessage_size(m , "test");

	for (i=0;i<n;i++) {
		printf("test[%d] = %d\n",i, pbc_rmessage_integer(m , "test" , i , NULL));
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(m,"tutorial.Ext.test",0,NULL));
	/* sub-messages obtained from m are not deleted separately here */
	pbc_rmessage_delete(m);
}
Пример #3
0
/*
 * Send a RegisterRequestProto to the AM and wait for the matching
 * RegisterResponseProto.
 *
 * The reply carries no data this function looks at; it is deleted as soon as
 * it has been received. Returns ORTE_SUCCESS when the round trip completes
 * and ORTE_ERROR when the request cannot be created or sent, or when no
 * reply is received.
 */
static int register_to_am() {
    struct pbc_wmessage* request = pbc_wmessage_new(orte_hdclient_pb_env, "RegisterRequestProto");
    if (NULL == request) {
        opal_output(0, "%s ras:yarn failed to create register msg", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    if (0 != orte_hdclient_send_message_and_delete(request, HAMSTER_MSG_REGISTER)) {
        opal_output(0, "%s ras:yarn error happened when send register to AM",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }

    /* wait for the reply; only its arrival matters */
    struct pbc_rmessage* reply = orte_hdclient_recv_message("RegisterResponseProto");
    if (NULL == reply) {
        opal_output(0, "%s ras:yarn error happened when recv register response from AM",
            ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        return ORTE_ERROR;
    }
    pbc_rmessage_delete(reply);

    OPAL_OUTPUT_VERBOSE((5, orte_ras_base.ras_output,
                     "%s ras:yarn successfully registered to AM",
                     ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    return ORTE_SUCCESS;
}
Пример #4
0
/*
 * Decode `slice` as a "tutorial.Person" message and print its "name", "id"
 * and "email" fields, the type name reported by pbc_type() for the "phone"
 * field, every "phone" entry, every "test" value and the
 * "tutorial.Ext.test" field to stdout.
 *
 * If the message cannot be decoded, the text from pbc_error() is printed and
 * nothing else is done.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "tutorial.Person", slice);
	if (m==NULL) {
		printf("Error : %s",pbc_error(env));
		return;
	}
	printf("name = %s\n", pbc_rmessage_string(m , "name" , 0 , NULL));
	printf("id = %d\n", pbc_rmessage_integer(m , "id" , 0 , NULL));
	printf("email = %s\n", pbc_rmessage_string(m , "email" , 0 , NULL));

	int phone_n = pbc_rmessage_size(m, "phone");
	int i;
	/* start from NULL so an unset result is never printed as a string */
	const char * field_name = NULL;
	pbc_type(env, "tutorial.Person", "phone", &field_name);
	printf("phone type [%s]\n", field_name ? field_name : "(unknown)");

	for (i=0;i<phone_n;i++) {
		struct pbc_rmessage * p = pbc_rmessage_message(m , "phone", i);
		if (p == NULL) {
			/* skip an entry that could not be opened instead of dereferencing it */
			continue;
		}
		printf("\tnumber[%d] = %s\n",i,pbc_rmessage_string(p , "number", i ,NULL));
		printf("\ttype[%d] = %s\n",i,pbc_rmessage_string(p, "type", i, NULL));
	}

	int n = pbc_rmessage_size(m , "test");

	for (i=0;i<n;i++) {
		printf("test[%d] = %d\n",i, pbc_rmessage_integer(m , "test" , i , NULL));
	}

	printf("tutorial.Ext.test = %d\n", pbc_rmessage_integer(m,"tutorial.Ext.test",0,NULL));
	pbc_rmessage_delete(m);
}
/**
 * Query the status of a single container through the RPC proxy.
 *
 * Generates a query-container-state request, sends it via `proxy`, and
 * decodes the reply as a GetContainerStatusResponseProto.
 *
 * @param proxy         RPC proxy used to send the request and read the reply
 * @param container_id  id passed to the request generator
 * @param state         out: "state" field of the reply's "status" message
 * @param retval        out: "exit_status" field of the reply's "status" message
 * @param diag_msg      not used by this function
 * @return 0 on success, -1 on any failure (state/retval are then not written)
 */
static int query_container_state_internal(hadoop_rpc_proxy_t* proxy, 
    int container_id,
    container_state_t* state,
    int* retval,
    char** diag_msg) {

    char* request = NULL;
    int request_len;
    int rc;
    int result = -1;

    rc = generate_query_container_state_request(&request, &request_len, proxy, container_id);
    if (0 != rc) {
        opal_output(0, "generate query_container_state_request failed.\n");
        return -1;
    }

    // send request
    rc = send_rpc_request(proxy, request, request_len);
    if (0 != rc) {
        opal_output(0, "send query_container_state_request failed.\n");
        free(request);
        return -1;
    }

    // the request buffer is no longer needed once it has been sent
    free(request);
    struct pbc_slice slice;

    // read response
    response_type_t response_type;
    response_type = recv_rpc_response(proxy, (char**)(&(slice.buffer)), &(slice.len));
    if (RESPONSE_SUCCEED != response_type) {
        process_bad_rpc_response(proxy, response_type);
        return -1;
    }

    // decode response
    struct pbc_rmessage* rmsg = pbc_rmessage_new(env, "GetContainerStatusResponseProto", &slice);
    if (!rmsg) {
        opal_output(0, "deserialize GetContainerStatusResponseProto from buffer failed.\n");
        goto out_free_buffer;
    }

    // get container status
    struct pbc_rmessage* status_msg = pbc_rmessage_message(rmsg, "status", 0);
    if (!status_msg) {
        opal_output(0, "get ContainerStatusProto from response failed.\n");
        // fall into the shared cleanup so rmsg and the receive buffer are released
        goto out_delete_msg;
    }

    // read state
    *state = pbc_rmessage_integer(status_msg, "state", 0, NULL);
    *retval = pbc_rmessage_integer(status_msg, "exit_status", 0, NULL);
    result = 0;

out_delete_msg:
    pbc_rmessage_delete(rmsg);
out_free_buffer:
    free(slice.buffer);
    return result;
}
Пример #6
0
/*
 * Decode `slice` as a "real" message and print its "f", "d" and "e" fields
 * to stdout.
 *
 * If the message cannot be decoded an error line is printed and nothing
 * else is done.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage * m = pbc_rmessage_new(env, "real", slice);
	if (m == NULL) {
		/* never hand a NULL message to the accessors below */
		printf("Error : failed to decode real\n");
		return;
	}
	printf("f = %f\n", pbc_rmessage_real(m , "f" , 0 ));
	printf("d = %f\n", pbc_rmessage_real(m , "d" , 0 ));
	/* cast so the argument really is a long, as %ld requires */
	printf("e = %ld\n", (long)pbc_rmessage_integer(m , "e" , 0,NULL ));
	pbc_rmessage_delete(m);
}
Пример #7
0
/*
 * Register the files of a serialized google.protobuf.FileDescriptorSet with
 * the environment `p`.
 *
 * Each entry of the "file" field is opened, then _register_no_dependency()
 * is called repeatedly until it returns 0. If a call returns the same value
 * as the previous one (initially the file count), no progress is possible
 * and registration fails with a dependency error.
 *
 * Returns 0 on success and 1 on failure; on failure p->lasterror points to a
 * static description of the problem.
 */
int
pbc_register(struct pbc_env * p, struct pbc_slice *slice) {
	struct pbc_rmessage * message = pbc_rmessage_new(p, "google.protobuf.FileDescriptorSet", slice);
	if (message == NULL) {
		p->lasterror = "register open google.protobuf.FileDescriptorSet fail";
		return 1;
	}

	struct pbc_rmessage ** files = NULL;
	int ret = 1;
	int i;
	int r;

	/* validate the count before it is used to size the allocation */
	int n = pbc_rmessage_size(message, "file");
	if (n <= 0) {
		p->lasterror = "register empty";
		goto _out;
	}

	/* allocated and released through the same malloc/free pair */
	files = (struct pbc_rmessage **)malloc((size_t)n * sizeof(*files));
	if (files == NULL) {
		p->lasterror = "register out of memory";
		goto _out;
	}

	for (i=0;i<n;i++) {
		files[i] = pbc_rmessage_message(message, "file", i);
		if (files[i] == NULL) {
			p->lasterror = "register open fail";
			goto _out;
		}
	}

	r = n;
	do {
		int rr = _register_no_dependency(p,files , n);
		if (rr == r) {
			/* nothing was registered in this pass */
			p->lasterror = "register dependency error";
			goto _out;
		}
		r = rr;
	} while (r>0);

	ret = 0;
_out:
	pbc_rmessage_delete(message);
	free(files);
	return ret;
}
Пример #8
0
/*
 * Register the files of a serialized google.protobuf.FileDescriptorSet with
 * the environment `p`.
 *
 * Each entry of the "file" field is opened, then _register_no_dependency()
 * is called repeatedly until it returns 0. If a call returns the same value
 * as the previous one (initially the file count), no progress is possible
 * and registration fails with a dependency error.
 *
 * Returns 0 on success and 1 on failure; on failure p->lasterror points to a
 * static description of the problem.
 */
int
pbc_register(struct pbc_env * p, struct pbc_slice *slice) {
	struct pbc_rmessage * message = pbc_rmessage_new(p, "google.protobuf.FileDescriptorSet", slice);
	if (message == NULL) {
		p->lasterror = "register open google.protobuf.FileDescriptorSet fail";
		return 1;
	}

	int status = 1;
	int i;
	int r;
	int n = pbc_rmessage_size(message, "file");
	if (n <= 0) {
		/*
		 * Must be rejected before the array below is declared: a
		 * variable-length array needs a length greater than zero. Handled
		 * with a plain return because a goto may not jump into the scope
		 * of a variable-length array.
		 */
		p->lasterror = "register empty";
		pbc_rmessage_delete(message);
		return 1;
	}

	/* NOTE(review): sized by input data and placed on the stack */
	struct pbc_rmessage * files[n];

	for (i=0;i<n;i++) {
		files[i] = pbc_rmessage_message(message, "file", i);
		if (files[i] == NULL) {
			p->lasterror = "register open fail";
			goto _out;
		}
	}

	r = n;
	do {
		int rr = _register_no_dependency(p,files , n);
		if (rr == r) {
			/* nothing was registered in this pass */
			p->lasterror = "register dependency error";
			goto _out;
		}
		r = rr;
	} while (r>0);

	status = 0;
_out:
	pbc_rmessage_delete(message);
	return status;
}
Пример #9
0
static void
test(struct pbc_env *env) {
	// int i;
	// for(i=0; i<COUNT; i++)
	// {
	// 		struct pbc_wmessage* w_msg = pbc_wmessage_new(env, "at");
	// 		struct pbc_rmessage* r_msg = NULL;
	// 		struct pbc_slice sl;
	// 		char buffer[1024];
	// 		sl.buffer = buffer, sl.len = 1024;
	// 		pbc_wmessage_integer(w_msg, "aa", 123, 0);
	// 		pbc_wmessage_integer(w_msg, "bb", 456, 0);
	// 		pbc_wmessage_string(w_msg, "cc", "test string!", 0);
	// 		pbc_wmessage_buffer(w_msg, &sl);

	// 		r_msg = pbc_rmessage_new(env, "at", &sl);

	// 		//===============
	// 		printf("aa = %d\n", pbc_rmessage_integer(r_msg, "aa", 0, NULL));
	// 		printf("bb = %d\n", pbc_rmessage_integer(r_msg, "bb", 0, NULL));
	// 		printf("cc = %s\n", pbc_rmessage_string(r_msg, "cc", 0, NULL));
	// 		//===============

	// 		pbc_rmessage_delete(r_msg);
	// 		pbc_wmessage_delete(w_msg);
	// }

	struct pbc_wmessage *w_msg = pbc_wmessage_new(env, "at");
	pbc_wmessage_integer(w_msg, "aa", 123, 0);
	pbc_wmessage_integer(w_msg, "bb", 456, 0); 
	pbc_wmessage_string(w_msg, "cc", "test string!", 0);

	struct pbc_slice slice;
	char buffer[1024];
	slice.len = 1024;
	slice.buffer = buffer;
	pbc_wmessage_buffer(w_msg, &slice);
	// pbc_wmessage_delete(w_msg);  //!!!!!!!! should not delete here

	struct pbc_rmessage *r_msg = pbc_rmessage_new(env, "at", &slice);
	printf("aa = %d\n", pbc_rmessage_integer(r_msg, "aa", 0, NULL));
	printf("bb = %d\n", pbc_rmessage_integer(r_msg, "bb", 0, NULL));
	printf("cc = %s\n", pbc_rmessage_string(r_msg, "cc", 0, NULL));

	pbc_rmessage_delete(r_msg);
	pbc_wmessage_delete(w_msg);
}
/**
 * Send a launch-container request through the RPC proxy and check that the
 * reply decodes as a StartContainerResponseProto.
 *
 * @param proxy           RPC proxy used to send the request and read the reply
 * @param launch_context  launch context passed to the request generator
 * @param container_id    id passed to the request generator
 * @return 0 when the reply is received and decoded, -1 on any failure
 */
static int launch_container_internal(hadoop_rpc_proxy_t* proxy,
    containers_launch_context_t* launch_context, int container_id) {

    char* payload = NULL;
    int payload_len;

    if (0 != generate_launch_container_request(&payload, &payload_len, proxy, container_id, launch_context)) {
        opal_output(0, "generate launch_container_request failed.\n");
        return -1;
    }

    // hand the serialized request to the proxy
    if (0 != send_rpc_request(proxy, payload, payload_len)) {
        opal_output(0, "send launch_container_request failed.\n");
        free(payload);
        return -1;
    }

    // the serialized request is not needed past this point
    free(payload);

    // fetch the reply
    struct pbc_slice reply;
    response_type_t reply_type = recv_rpc_response(proxy, (char**)(&(reply.buffer)), &(reply.len));
    if (RESPONSE_SUCCEED != reply_type) {
        process_bad_rpc_response(proxy, reply_type);
        return -1;
    }

    // decoding is only used as a validity check; no field is read
    struct pbc_rmessage* decoded = pbc_rmessage_new(env, "StartContainerResponseProto", &reply);
    if (NULL == decoded) {
        opal_output(0, "deserialize StartContainerResponseProto from buffer failed.\n");
        free(reply.buffer);
        return -1;
    }

    pbc_rmessage_delete(decoded);
    free(reply.buffer);
    return 0;
}
Пример #11
0
/*
 * Lua C function: releases everything held by the gcobj stored as userdata
 * in argument 1 - each pattern, each read-message, the two arrays that hold
 * them, and finally the pbc environment. The released pointers are reset to
 * NULL. Always returns 0.
 */
static int
_clear_gcobj(lua_State *L) {
	struct gcobj *gc = (struct gcobj *)lua_touserdata(L, 1);
	int idx;

	idx = 0;
	while (idx < gc->size_pat) {
		pbc_pattern_delete(gc->pat[idx]);
		++idx;
	}

	idx = 0;
	while (idx < gc->size_msg) {
		pbc_rmessage_delete(gc->msg[idx]);
		++idx;
	}

	/* drop the arrays themselves once their elements are gone */
	free(gc->pat);
	free(gc->msg);
	gc->pat = NULL;
	gc->msg = NULL;

	/* the environment is deleted last */
	pbc_delete(gc->env);
	gc->env = NULL;

	return 0;
}
Пример #12
0
static void
test(struct pbc_env *env) {
	int i;
	for(i=0; i<COUNT; i++)
	{
			struct pbc_wmessage* w_msg = pbc_wmessage_new(env, "at");
			struct pbc_rmessage* r_msg = NULL;
			struct pbc_slice sl;
			char buffer[1024];
			sl.buffer = buffer, sl.len = 1024;
			pbc_wmessage_integer(w_msg, "aa", 123, 0);
			pbc_wmessage_integer(w_msg, "bb", 456, 0);
			pbc_wmessage_string(w_msg, "cc", "test string!", -1);
			pbc_wmessage_buffer(w_msg, &sl);
					
			r_msg = pbc_rmessage_new(env, "at", &sl);
			pbc_rmessage_delete(r_msg);
			pbc_wmessage_delete(w_msg);
	} 
}
Пример #13
0
/*
 * Decode `slice` as a "test" message and, for every entry of its "el"
 * field, print the entry's "int16_min", "double_max" and "str" values to
 * stdout.
 *
 * If the message cannot be decoded, the text from pbc_error() is printed and
 * nothing else is done.
 */
static void
test_rmessage(struct pbc_env *env, struct pbc_slice *slice) {
	struct pbc_rmessage *root = pbc_rmessage_new(env, "test", slice);
	if (NULL == root) {
		printf("Error : %s",pbc_error(env));
		return;
	}

	int count = pbc_rmessage_size(root, "el");
	int idx = 0;

	while (idx < count) {
		struct pbc_rmessage *entry = pbc_rmessage_message(root, "el", idx);

		printf("\tint16_min[%d] = %d\n", idx, pbc_rmessage_integer(entry, "int16_min", idx, NULL));
		printf("\tdouble_max[%d] = %f\n", idx, pbc_rmessage_real(entry, "double_max", idx));
		printf("\tstring[%d] = %s\n", idx, pbc_rmessage_string(entry, "str", idx, NULL));
		idx++;
	}

	pbc_rmessage_delete(root);
}
Пример #14
0
/**
 * Build a LaunchRequestProto for the procs of `jdata`, send it to the AM and
 * apply the per-proc results of the LaunchResponseProto reply.
 *
 * @param jdata              job whose procs are packed into the request; the
 *                           state of each proc named in the reply is updated
 * @param launch_daemon      true: env/argv come from
 *                           setup_daemon_proc_env_and_argv() and proc index 0
 *                           is skipped; false: env/argv come from
 *                           setup_proc_env_and_argv()
 * @param launched_proc_num  out: number of procs the reply reports as
 *                           launched; written only on success
 * @return ORTE_SUCCESS when the request was sent and every result in the
 *         reply reports success, ORTE_ERROR otherwise
 */
static int common_launch_process(orte_job_t *jdata, bool launch_daemon, int *launched_proc_num)
{
	int i, rc, n;
	orte_proc_t* proc = NULL;
	char **argv;
	int argc;
	char **env;
	char *join_argv;
	bool error_flag = false;
	int launched_num = 0;
	struct pbc_rmessage* response_msg = NULL;

	/* 1. create launch message */
	/*
	 message LaunchRequestProto {
	 repeated LaunchContextProto launch_contexts = 1;
	 }

	 message LaunchContextProto {
	 repeated string envars = 1;
	 optional string args = 2;
	 optional string host_name = 3;
	 optional ProcessNameProto name = 4;
	 }

	 message ProcessNameProto {
	 optional int32 jobid = 1;
	 optional int32 vpid = 2;
	 }
	 */
	struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "LaunchRequestProto");
	if (!request_msg) {
		opal_output(0, "%s plm:yarn:common_process_launch: failed to create LaunchRequestProto",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		return ORTE_ERROR;
	}

	/* when launch_daemon, start from 1 because we don't need launch HNP process */
	i = launch_daemon ? 1 : 0;

	for (; i < jdata->num_procs; i++) {
		/*
		 * Reset everything the per-iteration cleanup releases, so that an
		 * early "goto cleanup" never frees a stale or indeterminate pointer.
		 */
		argv = NULL;
		argc = 0;
		env = NULL;
		join_argv = NULL;

		/* setup env/argv  */
		proc = opal_pointer_array_get_item(jdata->procs, i);
		if (!proc) {
			opal_output(0, "%s plm:yarn:common_launch_process: proc[%d] is NULL",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
			ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
			/* the code below dereferences proc, so this is fatal */
			error_flag = true;
			goto cleanup;
		}

		if (launch_daemon) {
			rc = setup_daemon_proc_env_and_argv(proc, &argv, &argc, &env);
		} else {
			orte_app_context_t* app = (orte_app_context_t*) opal_pointer_array_get_item(jdata->apps, proc->app_idx);
			rc = setup_proc_env_and_argv(jdata, app, proc, &argv, &env);
		}
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_launch_process: failed to setup env/argv of proc[%d]",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
			ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
			error_flag = true;
			goto cleanup;
		}

		/* joined once; used for the verbose log and for the "args" field */
		join_argv = opal_argv_join(argv, ' ');
		if (!join_argv) {
			opal_output(0,
					"%s plm:yarn:common_launch_process: failed to join argv of proc[%d]",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i);
			error_flag = true;
			goto cleanup;
		}

		/* print launch commandline and env when this env is specified */
		if (getenv("HAMSTER_VERBOSE")) {
			OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output, "%s plm:yarn:common_launch_process: launch argv=%s",
							ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), join_argv));
		}

		OPAL_OUTPUT_VERBOSE((5, orte_plm_globals.output,
									"%s plm:yarn:common_launch_process: after setup env and argv for proc=%d.",
									ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), i));

		/* now start packing request_msg */
		struct pbc_wmessage *launch_contexts_msg = pbc_wmessage_message(request_msg, "launch_contexts");
		if (!launch_contexts_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: create launch_contexts_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		char **tmp_env;
		for (tmp_env = env; tmp_env && *tmp_env; tmp_env++) {
			pbc_wmessage_string(launch_contexts_msg, "envars", *tmp_env, strlen(*tmp_env));
		}

		pbc_wmessage_string(launch_contexts_msg, "args", join_argv, strlen(join_argv));

		pbc_wmessage_string(launch_contexts_msg, "host_name", proc->node->name, strlen(proc->node->name));

		struct pbc_wmessage *proccess_name_msg = pbc_wmessage_message(launch_contexts_msg, "name");
		if (!proccess_name_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: create proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		rc = pbc_wmessage_integer(proccess_name_msg, "jobid", ORTE_LOCAL_JOBID(proc->name.jobid), 0);
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: pack jobid in proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

		rc = pbc_wmessage_integer(proccess_name_msg, "vpid", proc->name.vpid,
				0);
		if (0 != rc) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: pack vpid in proccess_name_msg failed",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			error_flag = true;
			goto cleanup;
		}

cleanup:
		/* free argv, env and the joined command line for this proc */
		if (argv) {
			opal_argv_free(argv);
		}
		if (env) {
			opal_argv_free(env);
		}
		free(join_argv);
		if (error_flag) {
			/* the request was never handed to the sender, so it is still ours */
			pbc_wmessage_delete(request_msg);
			ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
			return ORTE_ERROR;
		}
	}

	/* 2. send launch deamon procs request msg */
	rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_LAUNCH);
	if (rc != 0) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: error happened when send launch proc request to AM",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		/*
		 * NOTE(review): request_msg is not deleted again here. The sender is
		 * named "..._and_delete" and no other caller in this file deletes
		 * the message after a failed send, so it presumably owns the message
		 * on every path - confirm against its implementation.
		 */
		return ORTE_ERROR;
	}

	/* 3. recv response and parse the msg*/
	/*
	 message LaunchResponseProto {
	 repeated LaunchResultProto results = 1;
	 }

	 message LaunchResultProto {
	 optional ProcessNameProto name = 1;
	 optional bool success = 2;
	 }

	 message ProcessNameProto {
	 optional int32 jobid = 1;
	 optional int32 vpid = 2;
	 }
	 */
	response_msg = orte_hdclient_recv_message("LaunchResponseProto");
	if (!response_msg) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: error happened when recv launch response msg from AM",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
		goto launch_failed;
	}

	n = pbc_rmessage_size(response_msg, "results");
	if (n < 0) {
		opal_output(0,
				"%s plm:yarn:common_process_launch: got n(=%d) < 0, please check",
				ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
		goto launch_failed;
	}

	for (i = 0; i < n; i++) {
		struct pbc_rmessage* results_msg = pbc_rmessage_message(response_msg, "results", i);
		if (!results_msg) {
			opal_output(0,
					"%s plm:yarn:launch_daemons: error when parse returned launch results from AM",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			goto launch_failed;
		}

		struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(results_msg, "name", 0);
		if (!proc_name_msg) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: error when parse returned proc_name_msg from AM",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
			goto launch_failed;
		}

		orte_jobid_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL);
		orte_vpid_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL);

		bool success = pbc_rmessage_integer(results_msg, "success", 0, NULL);

		/* vpid comes from the reply; make sure it names a proc we know */
		orte_proc_t* launched_proc = (orte_proc_t*) opal_pointer_array_get_item(jdata->procs, vpid);
		if (!launched_proc) {
			opal_output(0,
					"%s plm:yarn:common_process_launch: no proc found for jobid = %u, vpid = %u",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
			goto launch_failed;
		}

		if (success) {
			launched_proc->state = ORTE_PROC_STATE_RUNNING;
			launched_num++;
		} else {
			opal_output(0,
					"%s plm:yarn:common_process_launch: launch proc failed when jobid = %u, vpid = %u",
					ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
			launched_proc->state = ORTE_PROC_STATE_FAILED_TO_START;
			jdata->state = ORTE_JOB_STATE_FAILED_TO_START;
			goto launch_failed;
		}
	}

	/* the reply is fully consumed; release it on the success path too */
	pbc_rmessage_delete(response_msg);

	/* to return back */
	*launched_proc_num = launched_num;
	return ORTE_SUCCESS;

launch_failed:
	if (response_msg) {
		pbc_rmessage_delete(response_msg);
	}
	ORTE_UPDATE_EXIT_STATUS(ORTE_ERROR_DEFAULT_EXIT_CODE);
	return ORTE_ERROR;
}
Пример #15
0
/**
 * Timer callback: exchange one heartbeat with the AM and process the list of
 * completed processes it reports.
 *
 * For every reported process the matching proc is looked up by jobid/vpid;
 * procs of the job passed in `data` are counted in
 * num_completed_jdata_procs, an exit value of 0 marks the proc terminated,
 * the exit values -1000/-100/-101 mark it as failed to start, and any other
 * exit value of a not-yet-terminated proc schedules process_state_monitor_cb
 * after 15 seconds. When the completed count equals jdata->num_procs the job
 * is marked terminated and finish_app_master() is called; otherwise this
 * callback is scheduled again after 1 second.
 *
 * @param fd     unused
 * @param event  unused
 * @param data   the orte_job_t being monitored
 */
static void heartbeat_with_AM_cb(int fd, short event, void *data)
{
    int i, rc;
    orte_job_t *jdata = (orte_job_t*)data;
    /* NOTE(review): looked up but never used below */
    orte_job_t* daemons = orte_get_job_data_object(ORTE_PROC_MY_NAME->jobid);

    /* 1. create heartbeat request msg */
    /*
    message HeartbeatRequestProto {
    }
    */
    struct pbc_wmessage* request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "HeartbeatRequestProto");
    if (!request_msg) {
        opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb: failed to create request_msg",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* 2. send heartbeat request msg */
    rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_HEARTBEAT);
    if (rc != 0) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: error happened when send request_msg to AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
        return;
    }

    /* 3. recv response and parse the msg*/
    /*
     message HeartbeatResponseProto {
         repeated ProcessStatusProto completed_processes = 1;
     }

     message ProcessStatusProto {
         optional ProcessNameProto name = 1;
         optional ProcessStateProto state = 2;
         optional int32 exit_value = 3;
     }

     enum ProcessStateProto {
         RUNNING = 1;
         COMPLETED = 2;
     }

     message ProcessNameProto {
         optional int32 jobid = 1;
         optional int32 vpid = 2;
     }
     */

    struct pbc_rmessage* response_msg = orte_hdclient_recv_message("HeartbeatResponseProto");
    if (!response_msg) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: error happened when recv HeartbeatResponseProto msg from AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    int n = pbc_rmessage_size(response_msg, "completed_processes");
    if (n < 0) {
        opal_output(0,
                "%s plm:yarn:heartbeat_with_AM_cb: got n(=%d) < 0, please check",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), n);
        goto cleanup;
    }

    for (i = 0; i < n; i++) {
        struct pbc_rmessage* completed_procs_msg = pbc_rmessage_message(response_msg, "completed_processes", i);
        if (!completed_procs_msg) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: error when parse returned completed_procs_msg from AM",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        struct pbc_rmessage* proc_name_msg = pbc_rmessage_message(completed_procs_msg, "name", 0);
        if (!proc_name_msg) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: error when parse proc_name_msg",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            goto cleanup;
        }

        uint32_t local_jobid = pbc_rmessage_integer(proc_name_msg, "jobid", 0, NULL);
        uint32_t vpid = pbc_rmessage_integer(proc_name_msg, "vpid", 0, NULL);

        uint32_t exit_value = pbc_rmessage_integer(completed_procs_msg, "exit_value", 0, NULL);

        /*
         * next, we will modify proc's state; jobid and vpid come from the
         * reply, so both lookups are checked before being dereferenced
         */
        orte_job_t* tmp_jdata = (orte_job_t*) opal_pointer_array_get_item(orte_job_data, local_jobid);
        if (!tmp_jdata) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: no job found for jobid = %u, skipped",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid);
            continue;
        }

        orte_proc_t* proc = (orte_proc_t*) opal_pointer_array_get_item(tmp_jdata->procs, vpid);
        if (!proc) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: no proc found for jobid = %u, vpid = %u, skipped",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), local_jobid, vpid);
            continue;
        }

        if (tmp_jdata->jobid == jdata->jobid) {
            num_completed_jdata_procs++;
        }

        if (exit_value == 0) {
            proc->state = ORTE_PROC_STATE_TERMINATED;
        }

        /* if this process is already terminated, just skip over */
        if (proc->state >= ORTE_PROC_STATE_TERMINATED) {
            continue;
        }

        /* exit_value is unsigned; the negative constants are converted to match it */
        if (exit_value == -1000 || exit_value == -100 || exit_value == -101) {
            opal_output(0, "%s plm:yarn:heartbeat_with_AM_cb proc failed to start", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_ERROR_LOG(ORTE_ERROR);
            proc->state = ORTE_PROC_STATE_FAILED_TO_START;
            ORTE_ACTIVATE_PROC_STATE(&proc->name, ORTE_PROC_STATE_FAILED_TO_LAUNCH);
        } else {
            /* here, means currently the proc's state < ORTE_PROC_STATE_TERMINATED,
             * however, from AM's heartbeat response, we got the proc's container is terminated,
             * to solve this dilemma , we set a timer event to reconfirm this proc's state,
             */
            opal_event_t *ev = (opal_event_t*) malloc(sizeof(opal_event_t));
            if (!ev) {
                opal_output(0,
                        "%s plm:yarn:heartbeat_with_AM_cb: failed to allocate timer event",
                        ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
                continue;
            }

            struct timeval delay;
            delay.tv_sec = 15;
            delay.tv_usec = 0;

            /* NOTE(review): ev is not freed in this function */
            opal_event_evtimer_set(orte_event_base, ev, process_state_monitor_cb, proc);
            opal_event_evtimer_add(ev, &delay);
        }
    }

cleanup:
    if (response_msg) {
        pbc_rmessage_delete(response_msg);
    }

    if (num_completed_jdata_procs == jdata->num_procs) {
        /*
         * all procs are completed, send finish request to AM,
         * modify job state to ORTE_JOB_STATE_TERMINATED
         */
        jdata->state = ORTE_JOB_STATE_TERMINATED;
        finish_app_master(0 == orte_exit_status);
        return;
    } else {
        /* next heartbeat */
        opal_event_t *ev = (opal_event_t*) malloc(sizeof(opal_event_t));
        if (!ev) {
            opal_output(0,
                    "%s plm:yarn:heartbeat_with_AM_cb: failed to allocate next heartbeat event",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
            ORTE_ERROR_LOG(ORTE_ERROR_DEFAULT_EXIT_CODE);
            return;
        }

        struct timeval delay;
        delay.tv_sec = 1;
        delay.tv_usec = 0;

        opal_event_evtimer_set(orte_event_base, ev, heartbeat_with_AM_cb, jdata);
        opal_event_evtimer_add(ev, &delay);
    }
}
Пример #16
0
/**
 * Tell the AM that the application has finished.
 *
 * Does nothing if appmaster_finished is already set. When `succeed` is true
 * the procs of every job in orte_job_data from index 1 on are re-checked,
 * and the result is downgraded to failure if any proc has not reached
 * ORTE_PROC_STATE_TERMINATED. A FinishRequestProto carrying the result and a
 * fixed diagnostics string is then sent; appmaster_finished is set once the
 * FinishResponseProto reply has been received.
 *
 * @param succeed  result the caller wants to report
 */
static void finish_app_master(bool succeed)
{
    int rc;
    int i, j;
    const char *diag_msg = "finish_app_master";
    /*
     * Both messages start out NULL so the shared cleanup below only ever
     * sees pointers that were actually obtained.
     */
    struct pbc_wmessage* request_msg = NULL;
    struct pbc_rmessage* response_msg = NULL;

    if (appmaster_finished) {
        return;
    }

    // we need double check if any proc failed
    if (succeed) {
        /* start with 1 because we don't want to check daemon's proc */
        for (i = 1; i < orte_job_data->size; i++) {
            orte_job_t* job = opal_pointer_array_get_item(orte_job_data, i);
            if (!job) {
                continue;
            }
            for (j = 0; j < job->procs->size; j++) {
                orte_proc_t* proc = opal_pointer_array_get_item(job->procs, j);
                if (!proc) {
                    continue;
                }
                // if any process is non-terminated, we will consider it's error
                if (proc->state < ORTE_PROC_STATE_TERMINATED) {
                    succeed = false;
                    break; /* break the inner 'for' loop */
                }
            }
            if (!succeed) {
                break;  /* break the outer 'for' loop */
            }
        }
    }

    /* 1. create finish message */
    /*
    message FinishRequestProto {
        optional bool succeed = 1;
        optional string diagnostics = 2;
    }
    */
    request_msg = pbc_wmessage_new(orte_hdclient_pb_env, "FinishRequestProto");
    if (!request_msg) {
        opal_output(0, "%s plm:yarn:finish_app_master: failed to create request_msg",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    rc = pbc_wmessage_integer(request_msg, "succeed", succeed, 0);
    if (0 != rc) {
        opal_output(0,
                "%s plm:yarn:finish_app_master: pack succeed in request_msg failed",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    rc = pbc_wmessage_string(request_msg, "diagnostics", diag_msg, strlen(diag_msg));
    if (0 != rc) {
        opal_output(0,
                "%s plm:yarn:finish_app_master: pack diagnostics in request_msg failed",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* 2. send finish request msg */
    rc = orte_hdclient_send_message_and_delete(request_msg, HAMSTER_MSG_FINISH);
    /*
     * The message has been handed to the sender; as before, it is not
     * deleted here after this point, whatever the send result.
     */
    request_msg = NULL;
    if (rc != 0) {
        opal_output(0,
                "%s plm:yarn:finish_app_master: error happened when send request_msg to AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    /* 3. recv response and parse the msg*/
    /*
     message FinishResponseProto {
     }
     */
    response_msg = orte_hdclient_recv_message("FinishResponseProto");
    if (!response_msg) {
        opal_output(0,
                "%s plm:yarn:finish_app_master: error happened when recv response_msg from AM",
                ORTE_NAME_PRINT(ORTE_PROC_MY_NAME));
        goto cleanup;
    }

    appmaster_finished = true;

cleanup:
    /* still owned here only when packing failed before the send */
    if (request_msg) {
        pbc_wmessage_delete(request_msg);
    }
    if (response_msg) {
        pbc_rmessage_delete(response_msg);
    }
}