Пример #1
0
/*
 * ConnectToHost
 *
 * Creates an SMPD_CONTEXT_PMI context on `set`, posts a connect to host:port
 * with that context as the user pointer, records the resulting sock and the
 * requested `state` in the context, and then calls smpd_enter_at_state().
 *
 * Parameters:
 *   host, port - address handed to SMPDU_Sock_post_connect
 *   state      - stored in the context and passed to smpd_enter_at_state
 *   set        - sock set used for the context and for the connect
 *   sockp      - out: filled in by SMPDU_Sock_post_connect
 *   contextpp  - out: filled in by smpd_create_context
 *
 * Returns SMPD_SUCCESS, or SMPD_FAIL after logging which step failed.
 * NOTE(review): the context created here is not released on the failure
 * paths; presumably the caller owns *contextpp - confirm.
 */
static int ConnectToHost(char *host, int port, smpd_state_t state, SMPDU_Sock_set_t set, SMPDU_Sock_t *sockp, smpd_context_t **contextpp)
{
    int rc;

    /* The context starts out without a sock; it is attached after the connect is posted. */
    rc = smpd_create_context(SMPD_CONTEXT_PMI, set, SMPDU_SOCK_INVALID_SOCK, -1, contextpp);
    if (rc != SMPD_SUCCESS)
    {
        smpd_err_printf("ConnectToHost failed: unable to create a context to connect to %s:%d with.\n", host, port);
        return SMPD_FAIL;
    }

    rc = SMPDU_Sock_post_connect(set, *contextpp, host, port, sockp);
    if (rc != SMPD_SUCCESS)
    {
        /* Translate the error code into text for the log message. */
        char err_text[SMPD_MAX_ERROR_LEN];
        int err_len = SMPD_MAX_ERROR_LEN;

        PMPI_Error_string(rc, err_text, &err_len);
        smpd_err_printf("ConnectToHost failed: unable to post a connect to %s:%d, error: %s\n", host, port, err_text);
        return SMPD_FAIL;
    }

    {
        smpd_context_t *ctx = *contextpp;

        ctx->sock = *sockp;
        ctx->state = state;
    }

    if (smpd_enter_at_state(set, state) != SMPD_SUCCESS)
    {
        smpd_err_printf("ConnectToHost failed: unable to connect to %s:%d.\n", host, port);
        return SMPD_FAIL;
    }

    return SMPD_SUCCESS;
}
Пример #2
0
/*
  FUNCTION: smpd_service_stop

  PURPOSE: Stops the service

  PARAMETERS:
    none

  RETURN VALUE:
    none

  COMMENTS:
    If a ServiceStop procedure is going to
    take longer than 3 seconds to execute,
    it should spawn a thread to execute the
    stop code, and return.  Otherwise, the
    ServiceControlManager will believe that
    the service has stopped responding.

  DETAILS:
    1. Tries up to 10 times (250 ms apart) to start smpd_bomb_thread and
       stores the handle in smpd_process.hBombThread.
       NOTE(review): presumably a watchdog that kills the process unless
       smpd_process.hBombDiffuseEvent is signalled - confirm against
       smpd_bomb_thread.
    2. Sets smpd_process.service_stop and posts a connect to the local
       host on smpd_process.port, then waits for one sock event.
    3. If any step fails, signals hBombDiffuseEvent, waits up to 3 seconds
       for the bomb thread, closes its handle and calls ExitProcess(-1).
*/
void smpd_service_stop()
{
    SMPDU_Sock_set_t set;
    SMPDU_Sock_t sock;
    SMPDU_Sock_event_t event;
    char host[SMPD_MAX_HOST_LENGTH];
    int iter;
    DWORD dwThreadID;
    int result;

    for (iter=0; iter<10; iter++)
    {
	smpd_process.hBombThread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)smpd_bomb_thread, NULL, 0, &dwThreadID);
	if (smpd_process.hBombThread != NULL)
	    break;
	Sleep(250);
    }

    /* stop the main thread */
    smpd_process.service_stop = SMPD_TRUE;

    /* The connect below targets this buffer, so it must hold a valid name. */
    result = smpd_get_hostname(host, SMPD_MAX_HOST_LENGTH);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("Unable to get the local hostname\n");
	goto fail;
    }

    result = SMPDU_Sock_create_set(&set);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPDU_Sock_create_set failed,\nsock error: %s\n", get_sock_error_string(result));
	goto fail;
    }

    result = SMPDU_Sock_post_connect(set, NULL, host, smpd_process.port, &sock);
    if (result != SMPD_SUCCESS)
    {
	/* Report the host actually used for the connect (the local host). */
	smpd_err_printf("Unable to connect to '%s:%d',\nsock error: %s\n",
	    host, smpd_process.port, get_sock_error_string(result));
	goto fail;
    }

    result = SMPDU_Sock_wait(set, SMPDU_SOCK_INFINITE_TIME, &event);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("Unable to connect to '%s:%d',\nsock error: %s\n",
	    host, smpd_process.port, get_sock_error_string(result));
	goto fail;
    }

    return;

fail:
    /* Common failure path: diffuse the bomb thread and terminate the process. */
    SetEvent(smpd_process.hBombDiffuseEvent);
    if (smpd_process.hBombThread != NULL)
    {
	/* The handle is NULL when every CreateThread attempt above failed. */
	WaitForSingleObject(smpd_process.hBombThread, (DWORD)3000);
	CloseHandle(smpd_process.hBombThread);
    }
    ExitProcess((UINT)-1);
}
Пример #3
0
/*
 * smpd_do_console
 *
 * Connects to the smpd on smpd_process.console_host:smpd_process.port and
 * runs the state machine starting at SMPD_MPIEXEC_CONNECTING_SMPD.
 *
 * Steps:
 *   - make sure smpd_process.passphrase is set (from smpd data, or by
 *     prompting unless smpd_process.noprompt is set);
 *   - create a sock set and store it in smpd_process.set;
 *   - create a left-child context, post the connect and enter the state
 *     machine;
 *   - when smpd_process.builtin_cmd is SMPD_CMD_DO_STATUS, smpd debug/error
 *     output is disabled (dbg_state = 0) for the duration and
 *     "no smpd running on <host>" is printed on failure.
 *
 * Returns the exit code when smpd_process.do_console_returns is SMPD_TRUE;
 * otherwise shuts down the stdin thread and calls smpd_exit(exit_code).
 * exit_code is 0 on success or the failing result value.
 *
 * FCNAME is expected to be defined by the surrounding file.
 */
int smpd_do_console()
{
    int result = -1;
    smpd_context_t *context;
    SMPDU_Sock_set_t set;
    SMPDU_Sock_t sock;
    SMPD_BOOL no_smpd = SMPD_FALSE;
    /* Set once dbg_state has been saved and cleared, so that the cleanup
     * code restores it on every exit path and never restores a value that
     * was not saved. */
    SMPD_BOOL output_suppressed = SMPD_FALSE;
    int saved_state = 0;
    int exit_code = 0;

    smpd_enter_fn(FCNAME);

    /* make sure we have a passphrase to authenticate connections to the smpds */
    if (smpd_process.passphrase[0] == '\0')
	smpd_get_smpd_data("phrase", smpd_process.passphrase, SMPD_PASSPHRASE_MAX_LENGTH);
    if (smpd_process.passphrase[0] == '\0')
    {
	if (smpd_process.noprompt)
	{
	    printf("Error: No smpd passphrase specified through the registry or .smpd file, exiting.\n");
	    goto quit_job;
	}
	printf("Please specify an authentication passphrase for smpd: ");
	fflush(stdout);
	smpd_get_password(smpd_process.passphrase);
    }

    result = SMPDU_Sock_create_set(&set);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPDU_Sock_create_set failed,\nsock error: %s\n", get_sock_error_string(result));
	goto quit_job;
    }
    smpd_process.set = set;

    /* set the id of the mpiexec node to zero */
    smpd_process.id = 0;

    /* turn off output if do_status is selected to suppress error messages;
     * it stays off until the cleanup section below */
    if (smpd_process.builtin_cmd == SMPD_CMD_DO_STATUS)
    {
	saved_state = smpd_process.dbg_state;
	smpd_process.dbg_state = 0;
	output_suppressed = SMPD_TRUE;
    }

    /* start connecting the tree by posting a connect to the first host */
    result = smpd_create_context(SMPD_CONTEXT_LEFT_CHILD, set, SMPDU_SOCK_INVALID_SOCK/*sock*/, 1, &context);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("Unable to create a context.\n");
	goto quit_job;
    }

    result = SMPDU_Sock_post_connect(set, context, smpd_process.console_host, smpd_process.port, &sock);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("Unable to connect to '%s:%d',\nsock error: %s\n",
	    smpd_process.console_host, smpd_process.port, get_sock_error_string(result));
	no_smpd = SMPD_TRUE;
	goto quit_job;
    }
    context->sock = sock;
    context->state = SMPD_MPIEXEC_CONNECTING_SMPD;
    smpd_process.left_context = context;

    result = smpd_enter_at_state(set, SMPD_MPIEXEC_CONNECTING_SMPD);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("state machine failed.\n");
	no_smpd = SMPD_TRUE;
	goto quit_job;
    }

quit_job:

    if (result != SMPD_SUCCESS)
    {
	exit_code = result;
    }

    if (smpd_process.builtin_cmd == SMPD_CMD_DO_STATUS && (no_smpd || smpd_process.state_machine_ret_val != SMPD_SUCCESS))
    {
	printf("no smpd running on %s\n", smpd_process.console_host);
    }

    /* turn output back on, whichever path led here */
    if (output_suppressed)
    {
	smpd_process.dbg_state = saved_state;
    }

    if (smpd_process.do_console_returns == SMPD_TRUE)
    {
	smpd_exit_fn(FCNAME);
	return exit_code;
    }

    /* NOTE: SMPDU_Sock_finalize is intentionally not called here. */

#ifdef HAVE_WINDOWS_H
    if (smpd_process.hCloseStdinThreadEvent)
	SetEvent(smpd_process.hCloseStdinThreadEvent);
    if (smpd_process.hStdinThread != NULL)
    {
	/* close stdin so the input thread will exit */
	CloseHandle(GetStdHandle(STD_INPUT_HANDLE));
	/* give the thread 3 seconds to exit before terminating it */
	if (WaitForSingleObject(smpd_process.hStdinThread, 3000) != WAIT_OBJECT_0)
	{
	    TerminateThread(smpd_process.hStdinThread, 321);
	}
	CloseHandle(smpd_process.hStdinThread);
    }
    if (smpd_process.hCloseStdinThreadEvent)
    {
	CloseHandle(smpd_process.hCloseStdinThreadEvent);
	smpd_process.hCloseStdinThreadEvent = NULL;
    }
#elif defined(USE_PTHREAD_STDIN_REDIRECTION)
    smpd_cancel_stdin_thread();
#endif
    smpd_exit_fn(FCNAME);
    smpd_exit(exit_code);
    return SMPD_SUCCESS;
}
Пример #4
0
/*
 * mpiexec entry point.
 *
 * Outline:
 *   1. Initialize (PMPI_Init, SMPDU_Sock_init, smpd_init_process) and parse
 *      the command line with mp_parse_command_args.
 *   2. If smpd_process.use_ms_hpc is set, rebuild the host list so that it
 *      only refers to the local host.
 *   3. Create the sock set.  If smpd_process.rsh_mpiexec is set, run
 *      mpiexec_rsh() and skip to cleanup.
 *   4. Arm the timeout mechanism (Windows thread, SIGALRM or pthread).
 *   5. Make sure a passphrase is available, create the left-child context
 *      and post a connect to the first host in smpd_process.host_list
 *      (or, on Windows with local_root, launch a local manager and
 *      reconnect to it).
 *   6. Run the state machine via smpd_enter_at_state.
 *   7. Cleanup at quit_job: set smpd_process.mpiexec_exit_code to -1 if a
 *      step failed and no exit code was recorded, finalize the sock layer,
 *      stop the stdin thread and return smpd_exit(mpiexec_exit_code).
 *
 * Returns the value of smpd_exit(), or the failing result / SMPD_FAIL on the
 * early-return paths that happen before cleanup is possible.
 */
int main(int argc, char* argv[])
{
    int result = SMPD_SUCCESS;
    smpd_host_node_t *host_node_ptr;
    smpd_launch_node_t *launch_node_ptr;
    smpd_context_t *context;
    SMPDU_Sock_set_t set;
    SMPDU_Sock_t sock = SMPDU_SOCK_INVALID_SOCK;
    smpd_state_t state;

    smpd_enter_fn("main");

    /* catch an empty command line */
    if (argc < 2)
    {
	mp_print_options();
	exit(0);
    }

    smpd_process.mpiexec_argv0 = argv[0];

    /* initialize */
    /* FIXME: Get rid of this hack - we already create 
     * local KVS for all singleton clients by default
     */
    putenv("PMI_SMPD_FD=0");
    result = PMPI_Init(&argc, &argv);
    /* SMPD_CS_ENTER(); */
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPD_Init failed,\nerror: %d\n", result);
	smpd_exit_fn("main");
	return result;
    }

    result = SMPDU_Sock_init();
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPDU_Sock_init failed,\nsock error: %s\n",
		      get_sock_error_string(result));
	smpd_exit_fn("main");
	return result;
    }

    result = smpd_init_process();
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("smpd_init_process failed.\n");
	goto quit_job;
    }

    smpd_process.dbg_state = SMPD_DBG_STATE_ERROUT;

    /* parse the command line */
    smpd_dbg_printf("parsing the command line.\n");
    result = mp_parse_command_args(&argc, &argv);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("Unable to parse the mpiexec command arguments.\n");
	goto quit_job;
    }

    /* If we are using MS HPC job scheduler we only connect
     * to the local SMPD
     */
    if(smpd_process.use_ms_hpc){
        char host[100];
        int id;
        /* Free the current host list */
        result = smpd_free_host_list();
        if(result != SMPD_SUCCESS){
            smpd_err_printf("Unable to free the global host list\n");
            goto quit_job;
        }
        /* Add local host to the host list */
        result = smpd_get_hostname(host, 100);
        if(result != SMPD_SUCCESS){
            smpd_err_printf("Unable to get the local hostname\n");
            goto quit_job;
        }
        /* NOTE(review): presumably smpd_get_host_id inserts the host into
         * the host list when it is not present - confirm. */
        result = smpd_get_host_id(host, &id);
        if(result != SMPD_SUCCESS){
            smpd_err_printf("Unable to get host id for local host\n");
            goto quit_job;
        }
        /* Set the number of PMI procs since they are not launched by mpiexec */
        /* NOTE(review): launch_list is dereferenced without a NULL check
         * although the debug dump below treats NULL as possible. */
        smpd_process.nproc = smpd_process.launch_list->nproc;
        smpd_dbg_printf("Adding (%s:%d) == (localhost) to the host list\n", host, id);
    }

    /* print and see what we've got */
    /* debugging output *************/
    smpd_dbg_printf("host tree:\n");
    host_node_ptr = smpd_process.host_list;
    if (!host_node_ptr)
	smpd_dbg_printf("<none>\n");
    while (host_node_ptr)
    {
	smpd_dbg_printf(" host: %s, parent: %d, id: %d\n",
	    host_node_ptr->host,
	    host_node_ptr->parent, host_node_ptr->id);
	host_node_ptr = host_node_ptr->next;
    }
    smpd_dbg_printf("launch nodes:\n");
    launch_node_ptr = smpd_process.launch_list;
    if (!launch_node_ptr)
	smpd_dbg_printf("<none>\n");
    while (launch_node_ptr)
    {
	smpd_dbg_printf(" iproc: %d, id: %d, exe: %s\n",
	    launch_node_ptr->iproc, launch_node_ptr->host_id,
	    launch_node_ptr->exe);
	launch_node_ptr = launch_node_ptr->next;
    }
    /* end debug output *************/

    /* set the id of the mpiexec node to zero */
    smpd_process.id = 0;

    result = SMPDU_Sock_create_set(&set);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPDU_Sock_create_set failed,\nsock error: %s\n", get_sock_error_string(result));
	goto quit_job;
    }
    smpd_process.set = set;

    /* Check to see if the user wants to use a remote shell mechanism for launching the processes
     * instead of using the smpd process managers.
     */
    if (smpd_process.rsh_mpiexec == SMPD_TRUE)
    {
	/* Do rsh or localonly stuff */
	result = mpiexec_rsh();

	/* skip over the non-rsh code and go to the cleanup section */
	goto quit_job;
    }

    /* Start the timeout mechanism if specified */
    /* This code occurs after the rsh_mpiexec option check because the rsh code implements timeouts differently */
    if (smpd_process.timeout > 0)
    {
#ifdef HAVE_WINDOWS_H
	/* create a Windows thread to sleep until the timeout expires */
	if (smpd_process.timeout_thread == NULL)
	{
	    smpd_process.timeout_thread = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)timeout_thread, NULL, 0, NULL);
	    if (smpd_process.timeout_thread == NULL)
	    {
		/* GetLastError returns a DWORD; cast to match the %d conversion */
		printf("Error: unable to create a timeout thread, errno %d.\n", (int)GetLastError());
		smpd_exit_fn("main");
		return SMPD_FAIL;
	    }
	}
#elif defined(SIGALRM)
	/* create an alarm to signal mpiexec when the timeout expires */
	smpd_signal(SIGALRM, timeout_function);
	alarm(smpd_process.timeout);
#elif defined(HAVE_PTHREAD_H)
	/* create a pthread to sleep until the timeout expires */
	result = pthread_create(&smpd_process.timeout_thread, NULL, timeout_thread, NULL);
	if (result != 0)
	{
	    printf("Error: unable to create a timeout thread, errno %d.\n", result);
	    smpd_exit_fn("main");
	    return SMPD_FAIL;
	}
#else
	/* no timeout mechanism available */
#endif
    }

    /* make sure we have a passphrase to authenticate connections to the smpds */
    if (smpd_process.passphrase[0] == '\0')
	smpd_get_smpd_data("phrase", smpd_process.passphrase, SMPD_PASSPHRASE_MAX_LENGTH);
    if (smpd_process.passphrase[0] == '\0')
    {
	if (smpd_process.noprompt)
	{
	    printf("Error: No smpd passphrase specified through the registry or .smpd file, exiting.\n");
	    result = SMPD_FAIL;
	    goto quit_job;
	}
	printf("Please specify an authentication passphrase for smpd: ");
	fflush(stdout);
	smpd_get_password(smpd_process.passphrase);
    }

    /* Everything below connects to the first entry of the host list, so an
     * empty list is a hard error rather than a NULL dereference. */
    if (smpd_process.host_list == NULL)
    {
	smpd_err_printf("no hosts available to connect to.\n");
	result = SMPD_FAIL;
	goto quit_job;
    }

    /* set the state to create a console session or a job session */
    state = smpd_process.do_console ? SMPD_MPIEXEC_CONNECTING_SMPD : SMPD_MPIEXEC_CONNECTING_TREE;

    result = smpd_create_context(SMPD_CONTEXT_LEFT_CHILD, set, sock, 1, &context);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("unable to create a context for the first host in the tree.\n");
	goto quit_job;
    }
#ifdef HAVE_WINDOWS_H
    if (!smpd_process.local_root)
    {
#endif
	/* start connecting the tree by posting a connect to the first host */
	result = SMPDU_Sock_post_connect(set, context, smpd_process.host_list->host, smpd_process.port, &sock);
	if (result != SMPD_SUCCESS)
	{
	    smpd_err_printf("Unable to connect to '%s:%d',\nsock error: %s\n",
		smpd_process.host_list->host, smpd_process.port, get_sock_error_string(result));
	    goto quit_job;
	}
#ifdef HAVE_WINDOWS_H
    }
#endif
    context->sock = sock;
    context->state = state;
    context->connect_to = smpd_process.host_list;
#ifdef HAVE_WINDOWS_H
    if (smpd_process.local_root)
    {
	int port;
	smpd_context_t *rc_context;

	/* The local_root option is implemented by having mpiexec act as the smpd
	 * and launch the smpd manager.  Then mpiexec connects to this manager just
	 * as if it had been created by a real smpd.  This causes all the processes
	 * destined for the first smpd host to be launched by this child process of
	 * mpiexec and not the smpd service.  This allows for these processes to
	 * create windows that are visible to the interactive user.  It also means 
	 * that the job cannot be run in the context of a user other than the user
	 * running mpiexec. */

	/* get the path to smpd.exe because pszExe is currently mpiexec.exe */
	smpd_get_smpd_data("binary", smpd_process.pszExe, SMPD_MAX_EXE_LENGTH);

	/* launch the manager process */
	result = smpd_start_win_mgr(context, SMPD_FALSE);
	if (result != SMPD_SUCCESS)
	{
	    smpd_err_printf("unable to start the local smpd manager.\n");
	    goto quit_job;
	}

	/* connect to the manager */
	smpd_dbg_printf("connecting a new socket.\n");
	port = atol(context->port_str);
	if (port < 1)
	{
	    smpd_err_printf("Invalid reconnect port read: %d\n", port);
	    /* result still holds SMPD_SUCCESS from smpd_start_win_mgr; mark
	     * the failure so that mpiexec does not exit with code 0 */
	    result = SMPD_FAIL;
	    goto quit_job;
	}
	result = smpd_create_context(context->type, context->set, SMPDU_SOCK_INVALID_SOCK, context->id, &rc_context);
	if (result != SMPD_SUCCESS)
	{
	    smpd_err_printf("unable to create a new context for the reconnection.\n");
	    goto quit_job;
	}
	/* hand the connection parameters over to the reconnect context and
	 * mark the original context as closing */
	rc_context->state = context->state;
	rc_context->write_state = SMPD_RECONNECTING;
	context->state = SMPD_CLOSING;
	rc_context->connect_to = context->connect_to;
	rc_context->connect_return_id = context->connect_return_id;
	rc_context->connect_return_tag = context->connect_return_tag;
	strcpy(rc_context->host, context->host);
	smpd_process.left_context = rc_context;
	smpd_dbg_printf("posting a re-connect to %s:%d in %s context.\n", rc_context->connect_to->host, port, smpd_get_context_str(rc_context));
	result = SMPDU_Sock_post_connect(rc_context->set, rc_context, rc_context->connect_to->host, port, &rc_context->sock);
	if (result != SMPD_SUCCESS)
	{
	    smpd_err_printf("Unable to post a connect to '%s:%d',\nsock error: %s\n",
		rc_context->connect_to->host, port, get_sock_error_string(result));
	    if (smpd_post_abort_command("Unable to connect to '%s:%d',\nsock error: %s\n",
		rc_context->connect_to->host, port, get_sock_error_string(result)) != SMPD_SUCCESS)
	    {
		goto quit_job;
	    }
	}
    }
    else
    {
#endif
	smpd_process.left_context = context;
	result = SMPDU_Sock_set_user_ptr(sock, context);
	if (result != SMPD_SUCCESS)
	{
	    smpd_err_printf("unable to set the smpd sock user pointer,\nsock error: %s\n",
		get_sock_error_string(result));
	    goto quit_job;
	}
#ifdef HAVE_WINDOWS_H
    }
#endif

#ifdef HAVE_WINDOWS_H
    {
	/* Create a break handler and a socket to handle aborting the job when mpiexec receives break signals */
	/* NOTE(review): the intermediate results below are overwritten
	 * without being checked, and reader_context is used even if
	 * smpd_create_context failed - left unchanged, confirm intent. */
	smpd_context_t *reader_context;
	SMPDU_Sock_t sock_reader;
	SMPDU_SOCK_NATIVE_FD reader, writer;

	smpd_make_socket_loop((SOCKET*)&reader, (SOCKET*)&writer);
	result = SMPDU_Sock_native_to_sock(set, reader, NULL, &sock_reader);
	result = SMPDU_Sock_native_to_sock(set, writer, NULL, &smpd_process.mpiexec_abort_sock);
	result = smpd_create_context(SMPD_CONTEXT_MPIEXEC_ABORT, set, sock_reader, -1, &reader_context);
	reader_context->read_state = SMPD_READING_MPIEXEC_ABORT;
	result = SMPDU_Sock_post_read(sock_reader, &reader_context->read_cmd.cmd, 1, 1, NULL);

	if (!SetConsoleCtrlHandler(mpiexec_ctrl_handler, TRUE))
	{
	    /* Don't error out; allow the job to run without a ctrl handler? */
	    result = GetLastError();
	    smpd_dbg_printf("unable to set a ctrl handler for mpiexec, error %d\n", result);
	}
    }
#endif

    result = smpd_enter_at_state(set, state);
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("state machine failed.\n");
	goto quit_job;
    }

quit_job:

    /* report a failure through the exit code unless one was already recorded */
    if ((result != SMPD_SUCCESS) && (smpd_process.mpiexec_exit_code == 0))
    {
	smpd_process.mpiexec_exit_code = -1;
    }

    /* finalize */

    smpd_dbg_printf("calling SMPDU_Sock_finalize\n");
    result = SMPDU_Sock_finalize();
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPDU_Sock_finalize failed,\nsock error: %s\n", get_sock_error_string(result));
    }

    /* SMPD_Finalize called in smpd_exit()
    smpd_dbg_printf("calling SMPD_Finalize\n");
    result = PMPI_Finalize();
    if (result != SMPD_SUCCESS)
    {
	smpd_err_printf("SMPD_Finalize failed,\nerror: %d\n", result);
    }
    */

#ifdef HAVE_WINDOWS_H
    if (smpd_process.hCloseStdinThreadEvent)
	SetEvent(smpd_process.hCloseStdinThreadEvent);
    if (smpd_process.hStdinThread != NULL)
    {
	/* close stdin so the input thread will exit */
	CloseHandle(GetStdHandle(STD_INPUT_HANDLE));
	/* give the thread 3 seconds to exit before terminating it */
	if (WaitForSingleObject(smpd_process.hStdinThread, 3000) != WAIT_OBJECT_0)
	{
	    TerminateThread(smpd_process.hStdinThread, 321);
	}
	CloseHandle(smpd_process.hStdinThread);
    }
    if (smpd_process.hCloseStdinThreadEvent)
    {
	CloseHandle(smpd_process.hCloseStdinThreadEvent);
	smpd_process.hCloseStdinThreadEvent = NULL;
    }
#elif defined(USE_PTHREAD_STDIN_REDIRECTION)
    smpd_cancel_stdin_thread();
#endif
    smpd_exit_fn("main");
    /* SMPD_CS_EXIT(); */
    return smpd_exit(smpd_process.mpiexec_exit_code);
}