Ejemplo n.º 1
0
	// Writes the current VMPI connection count and per-process progress into the
	// SQL database through g_pSQL. Always returns 1.
	virtual int RunCommand()
	{
		char szQuery[2048];

		// Store the number of connections VMPI currently reports in this job's
		// job_master_start row.
		const int nConnections = VMPI_GetCurrentNumberOfConnections();
		Q_snprintf( szQuery, sizeof( szQuery ),
			"update job_master_start set NumWorkers=%d where JobID=%lu",
			nConnections, g_JobPrimaryID );
		g_pSQL->Execute( szQuery );

		// Refresh the job_worker_start row of every process index from 1 upward
		// (index 0 is never visited here - presumably the master itself; confirm).
		for ( int iProc = 1; iProc < nConnections; ++iProc )
		{
			unsigned long jobWorkerID = VMPI_GetJobWorkerID( iProc );

			// Processes whose ID is 0xFFFFFFFF get no database update.
			if ( jobWorkerID == 0xFFFFFFFF )
				continue;

			Q_snprintf( szQuery, sizeof( szQuery ),
				"update job_worker_start set WorkerState=%d, NumWorkUnits=%d where JobWorkerID=%lu",
				VMPI_IsProcConnected( iProc ),
				(int) VMPI_GetNumWorkUnitsCompleted( iProc ),
				VMPI_GetJobWorkerID( iProc ) );
			g_pSQL->Execute( szQuery );
		}

		return 1;
	}
Ejemplo n.º 2
0
// Reports that the connection to process `procID` was lost for `pReason`.
// When g_bMPIMaster is set, the disconnect is counted in g_nDisconnects and the
// remaining worker count is printed; otherwise VMPI_HandleAutoRestart() is
// called followed by Error( "Worker quitting." ).
void HandleMPIDisconnect( int procID, const char *pReason )
{
	// Taken before g_nDisconnects is incremented further down.
	const int nRemaining = VMPI_GetCurrentNumberOfConnections() - g_nDisconnects - 1;

	// A packet-size limit was once exceeded without anyone noticing, so output is
	// left un-suppressed only when the reason mentions "invalid packet size";
	// for any other reason the messages below are printed with suppression on.
	const bool bSavedSuppress = g_bSuppressPrintfOutput;
	const bool bPacketSizeError = ( Q_stristr( pReason, "invalid packet size" ) != 0 );
	g_bSuppressPrintfOutput = !bPacketSizeError;

	Warning( "\n\n--- WARNING: lost connection to '%s' (%s).\n", VMPI_GetMachineName( procID ), pReason );

	if ( !g_bMPIMaster )
	{
		VMPI_HandleAutoRestart();
		Error( "Worker quitting." );
	}
	else
	{
		Warning( "%d workers remain.\n\n", nRemaining );

		++g_nDisconnects;
		/*
		if ( VMPI_GetCurrentNumberOfConnections() - g_nDisconnects <= 1 )
		{
			Error( "All machines disconnected!" );
		}
		*/
	}

	// Put the caller's suppression setting back.
	g_bSuppressPrintfOutput = bSavedSuppress;
}
// Prints a summary of VMPI traffic: elapsed time, kilobytes and messages sent and
// received, and (when g_bMPIMaster is set) the duplicated work-unit count plus a
// per-process work-unit table ordered by SortByWUCount.
//
// Output is only shown when the mpi_ShowDistributeWorkStats parameter is used;
// otherwise g_bSuppressPrintfOutput is forced on for the duration of the call.
// The previous suppression setting is restored before returning.
//
//   flTimeSpent        elapsed time used as the divisor for the k/sec rates
//   nBytesSent         total bytes sent
//   nBytesReceived     total bytes received
//   nMessagesSent      total messages sent
//   nMessagesReceived  total messages received
void ShowMPIStats(
    double flTimeSpent,
    unsigned long nBytesSent,
    unsigned long nBytesReceived,
    unsigned long nMessagesSent,
    unsigned long nMessagesReceived )
{
    // Integer division: both values are whole kilobyte counts.
    double flKSent = (nBytesSent + 511) / 1024;
    double flKRecv = (nBytesReceived + 511) / 1024;

    // Report a rate of zero instead of inf/nan when no time has elapsed.
    double flKSentPerSec = ( flTimeSpent > 0.0 ) ? flKSent / flTimeSpent : 0.0;
    double flKRecvPerSec = ( flTimeSpent > 0.0 ) ? flKRecv / flTimeSpent : 0.0;

    bool bShowOutput = VMPI_IsParamUsed( mpi_ShowDistributeWorkStats );

    bool bOldSuppress = g_bSuppressPrintfOutput;
    g_bSuppressPrintfOutput = !bShowOutput;

    Msg( "\n\n--------------------------------------------------------------\n");
    Msg( "Total Time       : %.2f\n", flTimeSpent );
    // The message counts are unsigned long, so they need %lu rather than %d.
    Msg( "Total Bytes Sent : %dk (%.2fk/sec, %lu messages)\n", (int)flKSent, flKSentPerSec, nMessagesSent );
    Msg( "Total Bytes Recv : %dk (%.2fk/sec, %lu messages)\n", (int)flKRecv, flKRecvPerSec, nMessagesReceived );
    if ( g_bMPIMaster )
    {
        // Guard the percentage against a zero work-unit total.
        // NOTE(review): %I64u is an MSVC length modifier and assumes the WU counters
        // are 64-bit unsigned - confirm against their declarations.
        float flDupPercent = ( g_nWUs != 0 ) ? (float)g_nDuplicatedWUs * 100.0f / g_nWUs : 0.0f;
        Msg( "Duplicated WUs   : %I64u (%.1f%%)\n", g_nDuplicatedWUs, flDupPercent );

        Msg( "\nWU count by proc:\n" );

        int nProcs = VMPI_GetCurrentNumberOfConnections();

        // Sort process indices rather than the counts themselves so each row can
        // still be matched to its machine name.
        CUtlVector<int> sortedProcs;
        sortedProcs.SetSize( nProcs );
        for ( int i=0; i < nProcs; i++ )
            sortedProcs[i] = i;

        qsort( sortedProcs.Base(), nProcs, sizeof( int ), SortByWUCount );

        for ( int i=0; i < nProcs; i++ )
        {
            const char *pMachineName = VMPI_GetMachineName( sortedProcs[i] );
            Msg( "%s", pMachineName );

            // Width that right-aligns the ':' at column 30. Computed as a signed int
            // so it matches %d, and clamped so names of 30+ characters cannot
            // underflow into a huge or negative field width.
            int nPadWidth = 30 - static_cast<int>( strlen( pMachineName ) );
            if ( nPadWidth < 1 )
                nPadWidth = 1;

            // Build a "%<width>s <count>\n" format, then print ':' through it.
            char formatStr[512];
            Q_snprintf( formatStr, sizeof( formatStr ), "%%%ds %I64u\n", nPadWidth, g_wuCountByProcess[ sortedProcs[i] ] );
            Msg( formatStr, ":" );
        }
    }
    Msg( "--------------------------------------------------------------\n\n ");

    g_bSuppressPrintfOutput = bOldSuppress;
}