// Reacts to a lost VMPI connection.
//
// procID  - process whose connection dropped; only used to look up its machine name.
// pReason - text describing the disconnect; also scanned for "invalid packet size".
//
// On the master the disconnect is counted and execution continues. On a worker,
// VMPI_HandleAutoRestart() is called and then Error( "Worker quitting." ) is raised.
void HandleMPIDisconnect( int procID, const char *pReason )
{
	// Computed up front, i.e. before g_nDisconnects is incremented for this event.
	const int nWorkersLeft = VMPI_GetCurrentNumberOfConnections() - g_nDisconnects - 1;

	// Hitting the packet size limit was hard to notice in the past, so a disconnect
	// whose reason mentions it is reported with output suppression turned off; any
	// other reason runs with suppression turned on.
	const bool bSavedSuppress = g_bSuppressPrintfOutput;
	const bool bPacketSizeProblem = ( Q_stristr( pReason, "invalid packet size" ) != 0 );
	g_bSuppressPrintfOutput = !bPacketSizeProblem;

	Warning( "\n\n--- WARNING: lost connection to '%s' (%s).\n", VMPI_GetMachineName( procID ), pReason );

	if ( !g_bMPIMaster )
	{
		VMPI_HandleAutoRestart();
		Error( "Worker quitting." );
	}
	else
	{
		Warning( "%d workers remain.\n\n", nWorkersLeft );

		++g_nDisconnects;
		/*
		if ( VMPI_GetCurrentNumberOfConnections() - g_nDisconnects <= 1 )
		{
			Error( "All machines disconnected!" );
		}
		*/
	}

	// Put the caller's suppression setting back.
	g_bSuppressPrintfOutput = bSavedSuppress;
}
// Example #2
//--------------------------------------------------
// UnSerialize face data
//
// Rebuilds g_pFaces[facenum] and facelight[facenum] from the message buffer.
// The buffer is consumed in this order: one dface_t, one facelight_t, the
// sample array (fl->numsamples entries), then every light / luxel /
// luxelNormals array whose pointer field was non-null in the facelight_t that
// was just read.
//
// The pointer values that arrive inside the facelight_t are only tested for
// null / non-null; each non-null one is replaced with a freshly calloc'ed
// array that is then filled from the buffer.
//
// pmb     - buffer to read from; its read offset advances.
// facenum - index into g_pFaces / facelight (not range-checked here).
// iSource - sending process, used for the machine name in error messages.
//
// Any short read, negative element count or failed allocation is reported
// through Error().
//
void UnSerializeFace( MessageBuffer * pmb, int facenum, int iSource )
{
	dface_t     * f  = &g_pFaces[facenum];
	facelight_t * fl = &facelight[facenum];

	if (pmb->read(f, sizeof(dface_t)) < 0) 
		Error("UnSerializeFace - invalid dface_t from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() );

	if (pmb->read(fl, sizeof(facelight_t)) < 0) 
		Error("UnSerializeFace - invalid facelight_t from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() );

	// numsamples was just read from the message. A negative value would turn
	// into an enormous size_t in the calloc and byte-count expressions below,
	// so reject it first.
	// NOTE(review): assumes numsamples / numluxels are signed ints, as the %d
	// formatting in this function suggests - confirm against facelight_t.
	if ( fl->numsamples < 0 )
		Error("UnSerializeFace - invalid sample count from %s (fl->numsamples: %d)", VMPI_GetMachineName( iSource ), fl->numsamples );

	fl->sample = (sample_t *) calloc(fl->numsamples, sizeof(sample_t));
	// calloc may legitimately return null for a zero-sized request, so only a
	// null result with a non-zero count is treated as a failure.
	if ( !fl->sample && fl->numsamples > 0 )
		Error("UnSerializeFace - out of memory allocating %d samples", fl->numsamples );
	if (pmb->read(fl->sample, sizeof(sample_t) * fl->numsamples) < 0) 
		Error("UnSerializeFace - invalid sample_t from %s (mb len: %d, offset: %d, fl->numsamples: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset(), fl->numsamples );

	//
	// Read the light information
	// 
	for (int i=0; i<MAXLIGHTMAPS; ++i) {
		for (int n=0; n<NUM_BUMP_VECTS+1; ++n) {
			if (fl->light[i][n])
			{
				fl->light[i][n] = (LightingValue_t *) calloc( fl->numsamples, sizeof(LightingValue_t ) );
				if ( !fl->light[i][n] && fl->numsamples > 0 )
					Error("UnSerializeFace - out of memory allocating %d lighting values", fl->numsamples );
				if ( ReadValues( pmb, fl->light[i][n], fl->numsamples) < 0)
					Error("UnSerializeFace - invalid fl->light from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() );
			}
		}
	}

	// Same validation for the luxel count, but only when an array that uses it
	// is actually present in the message.
	if ( ( fl->luxel || fl->luxelNormals ) && fl->numluxels < 0 )
		Error("UnSerializeFace - invalid luxel count from %s (fl->numluxels: %d)", VMPI_GetMachineName( iSource ), fl->numluxels );

	if (fl->luxel) {
		fl->luxel = (Vector *) calloc(fl->numluxels, sizeof(Vector));
		if ( !fl->luxel && fl->numluxels > 0 )
			Error("UnSerializeFace - out of memory allocating %d luxels", fl->numluxels );
		if (ReadValues( pmb, fl->luxel, fl->numluxels) < 0)
			Error("UnSerializeFace - invalid fl->luxel from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() );
	}

	if (fl->luxelNormals) {
		fl->luxelNormals = (Vector *) calloc(fl->numluxels, sizeof( Vector ));
		if ( !fl->luxelNormals && fl->numluxels > 0 )
			Error("UnSerializeFace - out of memory allocating %d luxel normals", fl->numluxels );
		if ( ReadValues( pmb, fl->luxelNormals, fl->numluxels) < 0)
			Error("UnSerializeFace - invalid fl->luxelNormals from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() );
	}

}
// Prints a summary of VMPI traffic and, on the master, the per-process work
// unit counts.
//
// flTimeSpent       - elapsed time, used as the divisor for the per-second rates.
// nBytesSent        - total bytes sent.
// nBytesReceived    - total bytes received.
// nMessagesSent     - number of messages sent.
// nMessagesReceived - number of messages received.
//
// Output goes through Msg(). It is suppressed (via g_bSuppressPrintfOutput)
// unless the mpi_ShowDistributeWorkStats parameter is in use; the previous
// suppression setting is restored before returning.
void ShowMPIStats(
    double flTimeSpent,
    unsigned long nBytesSent,
    unsigned long nBytesReceived,
    unsigned long nMessagesSent,
    unsigned long nMessagesReceived )
{
    // Integer division: the byte counts are reduced to whole kilobytes before
    // being stored as doubles.
    double flKSent = (nBytesSent + 511) / 1024;
    double flKRecv = (nBytesReceived + 511) / 1024;

    bool bShowOutput = VMPI_IsParamUsed( mpi_ShowDistributeWorkStats );

    bool bOldSuppress = g_bSuppressPrintfOutput;
    g_bSuppressPrintfOutput = !bShowOutput;

    Msg( "\n\n--------------------------------------------------------------\n");
    Msg( "Total Time       : %.2f\n", flTimeSpent );
    // The message counts are unsigned long, so they need %lu rather than %d.
    Msg( "Total Bytes Sent : %dk (%.2fk/sec, %lu messages)\n", (int)flKSent, flKSent / flTimeSpent, nMessagesSent );
    Msg( "Total Bytes Recv : %dk (%.2fk/sec, %lu messages)\n", (int)flKRecv, flKRecv / flTimeSpent, nMessagesReceived );
    if ( g_bMPIMaster )
    {
        Msg( "Duplicated WUs   : %I64u (%.1f%%)\n", g_nDuplicatedWUs, (float)g_nDuplicatedWUs * 100.0f / g_nWUs );

        Msg( "\nWU count by proc:\n" );

        int nProcs = VMPI_GetCurrentNumberOfConnections();

        // Process indices, ordered by the SortByWUCount comparator.
        CUtlVector<int> sortedProcs;
        sortedProcs.SetSize( nProcs );
        for ( int i=0; i < nProcs; i++ )
            sortedProcs[i] = i;

        qsort( sortedProcs.Base(), nProcs, sizeof( int ), SortByWUCount );

        for ( int i=0; i < nProcs; i++ )
        {
            const char *pMachineName = VMPI_GetMachineName( sortedProcs[i] );
            Msg( "%s", pMachineName );

            // Pad so that the ':' lands in column 30. The width is computed as
            // a signed int: strlen() returns size_t, which neither matches an
            // int-sized printf argument nor can represent the negative result
            // for names longer than 30 characters. A negative '*' width is
            // taken by printf as left-justification, which matches what the
            // generated "%-Ns" format did.
            const int nPadWidth = 30 - (int)strlen( pMachineName );
            Msg( "%*s %I64u\n", nPadWidth, ":", g_wuCountByProcess[ sortedProcs[i] ] );
        }
    }
    Msg( "--------------------------------------------------------------\n\n ");

    g_bSuppressPrintfOutput = bOldSuppress;
}
// Dispatches one DistributeWork packet.
//
// Layout as used here: data[0] is the packet id, data[1] the sub-packet id,
// data[2..3] an unsigned short holding the DistributeWork call index the
// packet belongs to; the payload is read starting at offset 4.
//
// pBuf      - the received packet; its read offset is set to 4.
// iSource   - process the packet came from.
// iPacketID - not used by this function.
//
// Returns true when the packet was handled here. Unknown sub-packet ids are
// forwarded to the current master or worker distributor and its result is
// returned; false is returned when neither distributor exists.
// Malformed packets are reported through Error().
bool DistributeWorkDispatch( MessageBuffer *pBuf, int iSource, int iPacketID )
{
    unsigned short iCurDistributeWorkCall = *((unsigned short*)&pBuf->data[2]);
    if ( iCurDistributeWorkCall >= MAX_DW_CALLS )
        Error( "Got an invalid DistributeWork packet (id: %d, sub: %d) (iCurDW: %d).", pBuf->data[0], pBuf->data[1], iCurDistributeWorkCall );

    pBuf->setOffset( 4 );

    switch ( pBuf->data[1] )
    {
    case DW_SUBPACKETID_MASTER_READY:
    {
        g_iMasterReadyForDistributeWorkCall = iCurDistributeWorkCall;
        return true;
    }

    case DW_SUBPACKETID_WORKER_READY:
    {
        if ( iCurDistributeWorkCall > g_iCurDSInfo || !g_bMPIMaster )
            Error( "State incorrect on master for DW_SUBPACKETID_WORKER_READY packet from %s.", VMPI_GetMachineName( iSource ) );

        if ( iCurDistributeWorkCall == g_iCurDSInfo )
        {
            // Ok, give this guy some WUs.
            if ( g_pCurDistributorMaster )
                g_pCurDistributorMaster->OnWorkerReady( iSource );
        }

        return true;
    }

    case DW_SUBPACKETID_MASTER_FINISHED:
    {
        g_iMasterFinishedDistributeWorkCall = iCurDistributeWorkCall;
        return true;
    }

    // Worker sends this to tell the master it has started on a work unit.
    case DW_SUBPACKETID_WU_STARTED:
    {
        // Packets for a different DistributeWork call are accepted and dropped.
        if ( iCurDistributeWorkCall != g_iCurDSInfo )
            return true;

        // A short packet would otherwise leave the index uninitialized, so the
        // read is checked and a failure is reported like any other bad packet.
        WUIndexType iWU = 0;
        if ( pBuf->read( &iWU, sizeof( iWU ) ) < 0 )
            Error( "DistributeWork: truncated DW_SUBPACKETID_WU_STARTED packet from %s.", VMPI_GetMachineName( iSource ) );

        VMPITracker_WorkUnitStarted( ( int ) iWU, iSource );
        return true;
    }


    case DW_SUBPACKETID_WU_RESULTS:
    {
        // We only care about work results for the iteration we're in.
        if ( iCurDistributeWorkCall != g_iCurDSInfo )
            return true;

        CDSInfo *pInfo = &g_DSInfo;

        // Same reasoning as above: never act on an index that was not
        // actually read from the packet.
        WUIndexType iWorkUnit = 0;
        if ( pBuf->read( &iWorkUnit, sizeof( iWorkUnit ) ) < 0 )
            Error( "DistributeWork: truncated DW_SUBPACKETID_WU_RESULTS packet from %s.", VMPI_GetMachineName( iSource ) );

        if ( iWorkUnit >= pInfo->m_nWorkUnits )
        {
            Error( "DistributeWork: got an invalid work unit index (%I64u for WU count of %I64u).", iWorkUnit, pInfo->m_nWorkUnits );
        }

        HandleWorkUnitCompleted( pInfo, iSource, iWorkUnit, pBuf );
        return true;
    }

    default:
    {
        // The last argument tells the distributor whether the packet belongs
        // to a DistributeWork call other than the current one.
        if ( g_pCurDistributorMaster )
            return g_pCurDistributorMaster->HandlePacket( pBuf, iSource, iCurDistributeWorkCall != g_iCurDSInfo );
        else if ( g_pCurDistributorWorker )
            return g_pCurDistributorWorker->HandlePacket( pBuf, iSource, iCurDistributeWorkCall != g_iCurDSInfo );
        else
            return false;
    }
    }
}
bool VMPITracker_WriteDebugFile( const char *pFilename )
{
	FILE *fp = fopen( pFilename, "wt" );
	if ( fp )
	{
		fprintf( fp, "# work units: %d\n", g_WorkUnits.Count() );
		fprintf( fp, "# active work units: %d\n", CountActiveWorkUnits() );
		
		fprintf( fp, "\n" );
		fprintf( fp, "--- Events ---" );
		fprintf( fp, "\n" );
		fprintf( fp, "\n" );
		
		for ( int i=0; i < g_WorkUnits.Count(); i++ )
		{
			CWorkUnit *wu = &g_WorkUnits[i];
			
			if ( wu->m_iWorkerCompleted != -1 )
				continue;

			fprintf( fp, "  work unit %d\n", i );
			fprintf( fp, "\n" );
						
			if ( wu->m_Events.Count() == 0 )
			{
				fprintf( fp, "    *no events*\n" );
			}
			else
			{
				for ( int iEvent=0; iEvent < wu->m_Events.Count(); iEvent++ )
				{
					CWorkUnitEvent *pEvent = &wu->m_Events[iEvent];
					
					if ( pEvent->m_iEventType == EVENT_TYPE_WU_STARTED )
					{
						fprintf( fp, "   started (by worker %s) %.1f seconds ago\n", 
							VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ),
							Plat_FloatTime() - wu->m_Events[iEvent].m_flTime );
					}
					else if ( pEvent->m_iEventType == EVENT_TYPE_SEND_WORK_UNIT )
					{
						fprintf( fp, "      sent (to worker %s) %.1f seconds ago\n", 
							VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ),
							Plat_FloatTime() - wu->m_Events[iEvent].m_flTime );
					}
					else if ( pEvent->m_iEventType == EVENT_TYPE_WU_COMPLETED )
					{
						fprintf( fp, " completed (by worker %s) %.1f seconds ago\n", 
							VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ),
							Plat_FloatTime() - wu->m_Events[iEvent].m_flTime );
					}
				}
			}
			fprintf( fp, "\n" );
		}
		
		fclose( fp );
		return true;
	}
	else
	{
		return false;
	}	
}
// Handles the VMPI sub-packets processed here: directory info, database info
// and worker crash reports.
//
// Layout as used here: data[1] is the sub-packet id and the payload starts at
// data[2].
//
// pBuf      - the received packet.
// iSource   - process the packet came from (used for machine names).
// iPacketID - not used by this function.
//
// Returns true when the sub-packet id is one of the three handled below,
// false otherwise.
bool SharedDispatch( MessageBuffer *pBuf, int iSource, int iPacketID )
{
	char *pInPos = &pBuf->data[2];

	switch ( pBuf->data[1] )
	{
		case VMPI_SUBPACKETID_DIRECTORIES:
		{
			// Payload: two consecutive null-terminated strings, gamedir then qdir.
			Q_strncpy( gamedir, pInPos, sizeof( gamedir ) );
			pInPos += strlen( pInPos ) + 1;

			Q_strncpy( qdir, pInPos, sizeof( qdir ) );
			
			g_bReceivedDirectoryInfo = true;
		}
		return true;

		case VMPI_SUBPACKETID_DBINFO:
		{
			// Payload: a raw CDBInfo followed by an unsigned long job id.
			g_DBInfo = *((CDBInfo*)pInPos);
			pInPos += sizeof( CDBInfo );
			g_JobPrimaryID = *((unsigned long*)pInPos);

			g_bReceivedDBInfo = true;
		}
		return true;

		case VMPI_SUBPACKETID_CRASH:
		{
			// First payload byte selects the kind of crash info; the data for
			// it starts two bytes in.
			char const chCrashInfoType = *pInPos;
			pInPos += 2;
			switch ( chCrashInfoType )
			{
			case 't':
				// Text crash message.
				Warning( "\nWorker '%s' dead: %s\n", VMPI_GetMachineName( iSource ), pInPos );
				break;
			case 'f':
				{
					// Minidump: an int byte count followed by the file contents.
					int iFileSize = * reinterpret_cast< int const * >( pInPos );
					pInPos += sizeof( iFileSize );

					// The size comes from the packet; a negative value would
					// become a huge size_t when handed to fwrite.
					// NOTE(review): the size is not checked against the amount
					// of data actually present in pBuf - consider bounding it.
					if ( iFileSize < 0 )
					{
						Warning( "\nReceived worker crash minidump with invalid size %d, ignoring.\n", iFileSize );
						break;
					}

					// Temp folder
					char const *szFolder = getenv( "TEMP" );
					if ( !szFolder ) szFolder = getenv( "TMP" );
					if ( !szFolder ) szFolder = "c:";

					// Base module name (path and extension stripped). The buffer
					// is zeroed so it is a valid empty string even if
					// GetModuleFileName fails.
					char chModuleName[_MAX_PATH] = { 0 }, *pModuleName = chModuleName;
					::GetModuleFileName( NULL, chModuleName, sizeof( chModuleName ) / sizeof( chModuleName[0] ) );

					if ( char *pch = strrchr( chModuleName, '.' ) )
						*pch = 0;
					if ( char *pch = strrchr( chModuleName, '\\' ) )
						*pch = 0, pModuleName = pch + 1;

					// Current time; fall back to a zeroed tm if localtime fails
					// so the fields below can still be read.
					time_t currTime = ::time( NULL );
					struct tm timeFallback = { 0 };
					struct tm * pTime = ::localtime( &currTime );
					if ( !pTime )
						pTime = &timeFallback;

					// Number of minidumps this run
					static int s_numMiniDumps = 0;
					++ s_numMiniDumps;

					// Prepare the filename. Every date/time field after the year
					// is zero-padded to two digits (the day used to be
					// space-padded, which put a space into the name for days
					// 1-9). The write is bounded because the folder comes from
					// the environment and may be arbitrarily long.
					char chSaveFileName[ 2 * _MAX_PATH ] = { 0 };
					int const nNameLen = snprintf( chSaveFileName, sizeof( chSaveFileName ),
						"%s\\vmpi_%s_on_%s_%d%.2d%.2d%.2d%.2d%.2d_%d.mdmp",
						szFolder,
						pModuleName,
						VMPI_GetMachineName( iSource ),
						pTime->tm_year + 1900,	/* full four-digit year */
						pTime->tm_mon + 1,		/* month (0 - 11 : 0 = January) */
						pTime->tm_mday,			/* day of month (1 - 31) */
						pTime->tm_hour,			/* hour (0 - 23) */
						pTime->tm_min,		    /* minutes (0 - 59) */
						pTime->tm_sec,		    /* seconds (0 - 59) */
						s_numMiniDumps
						);

					// Only report success if the name fit and every byte made
					// it to disk.
					bool bSaved = false;
					if ( nNameLen > 0 && nNameLen < (int)sizeof( chSaveFileName ) )
					{
						if ( FILE *fDump = fopen( chSaveFileName, "wb" ) )
						{
							bSaved = ( fwrite( pInPos, 1, iFileSize, fDump ) == (size_t)iFileSize );
							if ( fclose( fDump ) != 0 )
								bSaved = false;
						}
					}

					if ( bSaved )
					{
						Warning( "\nSaved worker crash minidump '%s', size %d byte(s).\n",
							chSaveFileName, iFileSize );
					}
					else
					{
						Warning( "\nReceived worker crash minidump size %d byte(s), failed to save.\n", iFileSize );
					}
				}
				break;
			}
		}
		return true;
	}

	return false;
}