void HandleMPIDisconnect( int procID, const char *pReason ) { int nLiveWorkers = VMPI_GetCurrentNumberOfConnections() - g_nDisconnects - 1; // We ran into the size limit before and it wasn't readily apparent that the size limit had // been breached, so make sure to show errors about invalid packet sizes.. bool bOldSuppress = g_bSuppressPrintfOutput; g_bSuppressPrintfOutput = ( Q_stristr( pReason, "invalid packet size" ) == 0 ); Warning( "\n\n--- WARNING: lost connection to '%s' (%s).\n", VMPI_GetMachineName( procID ), pReason ); if ( g_bMPIMaster ) { Warning( "%d workers remain.\n\n", nLiveWorkers ); ++g_nDisconnects; /* if ( VMPI_GetCurrentNumberOfConnections() - g_nDisconnects <= 1 ) { Error( "All machines disconnected!" ); } */ } else { VMPI_HandleAutoRestart(); Error( "Worker quitting." ); } g_bSuppressPrintfOutput = bOldSuppress; }
//-------------------------------------------------- // UnSerialize face data // void UnSerializeFace( MessageBuffer * pmb, int facenum, int iSource ) { int i, n; dface_t * f = &g_pFaces[facenum]; facelight_t * fl = &facelight[facenum]; if (pmb->read(f, sizeof(dface_t)) < 0) Error("UnSerializeFace - invalid dface_t from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() ); if (pmb->read(fl, sizeof(facelight_t)) < 0) Error("UnSerializeFace - invalid facelight_t from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() ); fl->sample = (sample_t *) calloc(fl->numsamples, sizeof(sample_t)); if (pmb->read(fl->sample, sizeof(sample_t) * fl->numsamples) < 0) Error("UnSerializeFace - invalid sample_t from %s (mb len: %d, offset: %d, fl->numsamples: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset(), fl->numsamples ); // // Read the light information // for (i=0; i<MAXLIGHTMAPS; ++i) { for (n=0; n<NUM_BUMP_VECTS+1; ++n) { if (fl->light[i][n]) { fl->light[i][n] = (LightingValue_t *) calloc( fl->numsamples, sizeof(LightingValue_t ) ); if ( ReadValues( pmb, fl->light[i][n], fl->numsamples) < 0) Error("UnSerializeFace - invalid fl->light from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() ); } } } if (fl->luxel) { fl->luxel = (Vector *) calloc(fl->numluxels, sizeof(Vector)); if (ReadValues( pmb, fl->luxel, fl->numluxels) < 0) Error("UnSerializeFace - invalid fl->luxel from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() ); } if (fl->luxelNormals) { fl->luxelNormals = (Vector *) calloc(fl->numluxels, sizeof( Vector )); if ( ReadValues( pmb, fl->luxelNormals, fl->numluxels) < 0) Error("UnSerializeFace - invalid fl->luxelNormals from %s (mb len: %d, offset: %d)", VMPI_GetMachineName( iSource ), pmb->getLen(), pmb->getOffset() ); } }
void ShowMPIStats( double flTimeSpent, unsigned long nBytesSent, unsigned long nBytesReceived, unsigned long nMessagesSent, unsigned long nMessagesReceived ) { double flKSent = (nBytesSent + 511) / 1024; double flKRecv = (nBytesReceived + 511) / 1024; bool bShowOutput = VMPI_IsParamUsed( mpi_ShowDistributeWorkStats ); bool bOldSuppress = g_bSuppressPrintfOutput; g_bSuppressPrintfOutput = !bShowOutput; Msg( "\n\n--------------------------------------------------------------\n"); Msg( "Total Time : %.2f\n", flTimeSpent ); Msg( "Total Bytes Sent : %dk (%.2fk/sec, %d messages)\n", (int)flKSent, flKSent / flTimeSpent, nMessagesSent ); Msg( "Total Bytes Recv : %dk (%.2fk/sec, %d messages)\n", (int)flKRecv, flKRecv / flTimeSpent, nMessagesReceived ); if ( g_bMPIMaster ) { Msg( "Duplicated WUs : %I64u (%.1f%%)\n", g_nDuplicatedWUs, (float)g_nDuplicatedWUs * 100.0f / g_nWUs ); Msg( "\nWU count by proc:\n" ); int nProcs = VMPI_GetCurrentNumberOfConnections(); CUtlVector<int> sortedProcs; sortedProcs.SetSize( nProcs ); for ( int i=0; i < nProcs; i++ ) sortedProcs[i] = i; qsort( sortedProcs.Base(), nProcs, sizeof( int ), SortByWUCount ); for ( int i=0; i < nProcs; i++ ) { const char *pMachineName = VMPI_GetMachineName( sortedProcs[i] ); Msg( "%s", pMachineName ); char formatStr[512]; Q_snprintf( formatStr, sizeof( formatStr ), "%%%ds %I64u\n", 30 - strlen( pMachineName ), g_wuCountByProcess[ sortedProcs[i] ] ); Msg( formatStr, ":" ); } } Msg( "--------------------------------------------------------------\n\n "); g_bSuppressPrintfOutput = bOldSuppress; }
// Dispatch handler for DistributeWork packets.
//
// pBuf      - received packet. data[1] is the sub-packet id, the two bytes at
//             data[2] hold the DistributeWork call index, and the payload
//             starts at offset 4.
// iSource   - process the packet came from.
// iPacketID - unused here.
//
// Returns true when the packet was handled. Unknown sub-packet ids are passed
// to the current distributor master or worker; with neither set the result is
// false.
//
// NOTE(review): the packet length is not validated before data[1..3] are read;
// presumably the dispatcher guarantees at least 4 bytes - confirm.
bool DistributeWorkDispatch( MessageBuffer *pBuf, int iSource, int iPacketID )
{
	unsigned short iCurDistributeWorkCall = *((unsigned short*)&pBuf->data[2]);
	if ( iCurDistributeWorkCall >= MAX_DW_CALLS )
		Error( "Got an invalid DistributeWork packet (id: %d, sub: %d) (iCurDW: %d).", pBuf->data[0], pBuf->data[1], iCurDistributeWorkCall );

	CDSInfo *pInfo = &g_DSInfo;

	// Skip the 4-byte header so the reads below start at the payload.
	pBuf->setOffset( 4 );

	switch ( pBuf->data[1] )
	{
		case DW_SUBPACKETID_MASTER_READY:
		{
			g_iMasterReadyForDistributeWorkCall = iCurDistributeWorkCall;
			return true;
		}

		case DW_SUBPACKETID_WORKER_READY:
		{
			if ( iCurDistributeWorkCall > g_iCurDSInfo || !g_bMPIMaster )
				Error( "State incorrect on master for DW_SUBPACKETID_WORKER_READY packet from %s.", VMPI_GetMachineName( iSource ) );

			if ( iCurDistributeWorkCall == g_iCurDSInfo )
			{
				// Ok, give this guy some WUs.
				if ( g_pCurDistributorMaster )
					g_pCurDistributorMaster->OnWorkerReady( iSource );
			}

			return true;
		}

		case DW_SUBPACKETID_MASTER_FINISHED:
		{
			g_iMasterFinishedDistributeWorkCall = iCurDistributeWorkCall;
			return true;
		}

		// Worker sends this to tell the master it has started on a work unit.
		case DW_SUBPACKETID_WU_STARTED:
		{
			// Packets for another DistributeWork call are ignored.
			if ( iCurDistributeWorkCall != g_iCurDSInfo )
				return true;

			// A short read would leave the index unset, so treat it as a bad packet.
			WUIndexType iWU = 0;
			if ( pBuf->read( &iWU, sizeof( iWU ) ) < 0 )
			{
				Error( "DistributeWork: truncated DW_SUBPACKETID_WU_STARTED packet from %s.", VMPI_GetMachineName( iSource ) );
				return true;	// only reached if Error() returns
			}

			VMPITracker_WorkUnitStarted( ( int ) iWU, iSource );
			return true;
		}

		case DW_SUBPACKETID_WU_RESULTS:
		{
			// We only care about work results for the iteration we're in.
			if ( iCurDistributeWorkCall != g_iCurDSInfo )
				return true;

			// Same truncation check as above, before the index is range-checked.
			WUIndexType iWorkUnit = 0;
			if ( pBuf->read( &iWorkUnit, sizeof( iWorkUnit ) ) < 0 )
			{
				Error( "DistributeWork: truncated DW_SUBPACKETID_WU_RESULTS packet from %s.", VMPI_GetMachineName( iSource ) );
				return true;	// only reached if Error() returns
			}

			if ( iWorkUnit >= pInfo->m_nWorkUnits )
			{
				Error( "DistributeWork: got an invalid work unit index (%I64u for WU count of %I64u).", iWorkUnit, pInfo->m_nWorkUnits );
			}

			HandleWorkUnitCompleted( pInfo, iSource, iWorkUnit, pBuf );
			return true;
		}

		default:
		{
			// The third argument tells the distributor whether the packet
			// belongs to a different DistributeWork call than the current one.
			if ( g_pCurDistributorMaster )
				return g_pCurDistributorMaster->HandlePacket( pBuf, iSource, iCurDistributeWorkCall != g_iCurDSInfo );
			else if ( g_pCurDistributorWorker )
				return g_pCurDistributorWorker->HandlePacket( pBuf, iSource, iCurDistributeWorkCall != g_iCurDSInfo );
			else
				return false;
		}
	}
}
bool VMPITracker_WriteDebugFile( const char *pFilename ) { FILE *fp = fopen( pFilename, "wt" ); if ( fp ) { fprintf( fp, "# work units: %d\n", g_WorkUnits.Count() ); fprintf( fp, "# active work units: %d\n", CountActiveWorkUnits() ); fprintf( fp, "\n" ); fprintf( fp, "--- Events ---" ); fprintf( fp, "\n" ); fprintf( fp, "\n" ); for ( int i=0; i < g_WorkUnits.Count(); i++ ) { CWorkUnit *wu = &g_WorkUnits[i]; if ( wu->m_iWorkerCompleted != -1 ) continue; fprintf( fp, " work unit %d\n", i ); fprintf( fp, "\n" ); if ( wu->m_Events.Count() == 0 ) { fprintf( fp, " *no events*\n" ); } else { for ( int iEvent=0; iEvent < wu->m_Events.Count(); iEvent++ ) { CWorkUnitEvent *pEvent = &wu->m_Events[iEvent]; if ( pEvent->m_iEventType == EVENT_TYPE_WU_STARTED ) { fprintf( fp, " started (by worker %s) %.1f seconds ago\n", VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ), Plat_FloatTime() - wu->m_Events[iEvent].m_flTime ); } else if ( pEvent->m_iEventType == EVENT_TYPE_SEND_WORK_UNIT ) { fprintf( fp, " sent (to worker %s) %.1f seconds ago\n", VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ), Plat_FloatTime() - wu->m_Events[iEvent].m_flTime ); } else if ( pEvent->m_iEventType == EVENT_TYPE_WU_COMPLETED ) { fprintf( fp, " completed (by worker %s) %.1f seconds ago\n", VMPI_GetMachineName( wu->m_Events[iEvent].m_iWorker ), Plat_FloatTime() - wu->m_Events[iEvent].m_flTime ); } } } fprintf( fp, "\n" ); } fclose( fp ); return true; } else { return false; } }
// Dispatch handler for the shared VMPI sub-packets. data[1] selects the
// sub-packet and the payload starts at data[2].
//
//   VMPI_SUBPACKETID_DIRECTORIES - two consecutive NUL-terminated strings,
//                                  copied into gamedir and qdir.
//   VMPI_SUBPACKETID_DBINFO      - a CDBInfo followed by an unsigned long,
//                                  copied into g_DBInfo and g_JobPrimaryID.
//   VMPI_SUBPACKETID_CRASH       - a type character, one skipped byte, then
//                                  either a text message ('t') or an int size
//                                  followed by minidump bytes ('f'), which are
//                                  saved to a file in the temp folder.
//
// pBuf      - received packet.
// iSource   - process the packet came from (used for its machine name).
// iPacketID - unused here.
//
// Returns true for the three sub-packets above, false for anything else.
//
// NOTE(review): payload lengths are not validated against the buffer size
// here; presumably the sender is trusted - confirm.
bool SharedDispatch( MessageBuffer *pBuf, int iSource, int iPacketID )
{
	char *pInPos = &pBuf->data[2];

	switch ( pBuf->data[1] )
	{
		case VMPI_SUBPACKETID_DIRECTORIES:
		{
			Q_strncpy( gamedir, pInPos, sizeof( gamedir ) );
			pInPos += strlen( pInPos ) + 1;	// step over the first string and its terminator
			Q_strncpy( qdir, pInPos, sizeof( qdir ) );

			g_bReceivedDirectoryInfo = true;
		}
		return true;

		case VMPI_SUBPACKETID_DBINFO:
		{
			g_DBInfo = *((CDBInfo*)pInPos);
			pInPos += sizeof( CDBInfo );
			g_JobPrimaryID = *((unsigned long*)pInPos);

			g_bReceivedDBInfo = true;
		}
		return true;

		case VMPI_SUBPACKETID_CRASH:
		{
			char const chCrashInfoType = *pInPos;
			pInPos += 2;

			switch ( chCrashInfoType )
			{
			case 't':
				Warning( "\nWorker '%s' dead: %s\n", VMPI_GetMachineName( iSource ), pInPos );
				break;

			case 'f':
				{
					int iFileSize = * reinterpret_cast< int const * >( pInPos );
					pInPos += sizeof( iFileSize );

					// The size comes from the packet; a negative value would turn
					// into a huge size_t in fwrite() below.
					if ( iFileSize < 0 )
					{
						Warning( "\nReceived worker crash minidump with invalid size %d, not saved.\n", iFileSize );
						break;
					}

					// Temp folder
					char const *szFolder = getenv( "TEMP" );
					if ( !szFolder ) szFolder = getenv( "TMP" );
					if ( !szFolder ) szFolder = "c:";

					// Base module name: strip the extension, then the directory part.
					// Zero-initialized so a failed GetModuleFileName leaves an empty name.
					char chModuleName[_MAX_PATH] = { 0 }, *pModuleName = chModuleName;
					::GetModuleFileName( NULL, chModuleName, sizeof( chModuleName ) / sizeof( chModuleName[0] ) );

					if ( char *pch = strrchr( chModuleName, '.' ) )
						*pch = 0;
					if ( char *pch = strrchr( chModuleName, '\\' ) )
						*pch = 0, pModuleName = pch + 1;

					// Current time
					time_t currTime = ::time( NULL );
					struct tm * pTime = ::localtime( &currTime );

					// Number of minidumps this run
					static int s_numMiniDumps = 0;
					++ s_numMiniDumps;

					// Prepare the filename. Bounded write so a long temp path or
					// machine name cannot overrun the buffer; every date/time field
					// after the year is zero-padded to two digits.
					char chSaveFileName[ 2 * _MAX_PATH ] = { 0 };
					Q_snprintf( chSaveFileName, sizeof( chSaveFileName ),
						"%s\\vmpi_%s_on_%s_%d%.2d%.2d%.2d%.2d%.2d_%d.mdmp",
						szFolder,
						pModuleName,
						VMPI_GetMachineName( iSource ),
						pTime->tm_year + 1900,	/* tm_year is years since 1900 */
						pTime->tm_mon + 1,		/* month (0 - 11 : 0 = January) */
						pTime->tm_mday,			/* day of month (1 - 31) */
						pTime->tm_hour,			/* hour (0 - 23) */
						pTime->tm_min,			/* minutes (0 - 59) */
						pTime->tm_sec,			/* seconds (0 - 59) */
						s_numMiniDumps
						);

					if ( FILE *fDump = fopen( chSaveFileName, "wb" ) )
					{
						fwrite( pInPos, 1, iFileSize, fDump );
						fclose( fDump );

						Warning( "\nSaved worker crash minidump '%s', size %d byte(s).\n",
							chSaveFileName, iFileSize );
					}
					else
					{
						Warning( "\nReceived worker crash minidump size %d byte(s), failed to save.\n", iFileSize );
					}
				}
				break;
			}
		}
		return true;
	}

	return false;
}