bool
Job::AddParent( Job* parent, MyString &whynot )
{
	if( !this->CanAddParent( parent, whynot ) ) {
		return false;
	}

	if( HasParent( parent ) ) {
		debug_printf( DEBUG_QUIET,
					"Warning: child %s already has parent %s\n",
					GetJobName(), parent->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_PARENTS, parent->GetJobID() ) ) {
		whynot = "unknown error appending to PARENTS queue";
		return false;
	}

	if( parent->GetStatus() != STATUS_DONE ) {
		if( !Add( Q_WAITING, parent->GetJobID() ) ) {
			// This node's dependency queues are now out of sync and
			// thus the DAG state is FUBAR, so we should bail...
			EXCEPT( "Failed to add parent %s to job %s",
						parent->GetJobName(), GetJobName() );
			return false;
		}
	}

	whynot = "n/a";
	return true;
}
bool
Job::SetStatus( status_t newStatus )
{
	debug_printf( DEBUG_DEBUG_1, "Job(%s)::_Status = %s\n",
				GetJobName(), status_t_names[_Status] );
	debug_printf( DEBUG_DEBUG_1, "Job(%s)::SetStatus(%s)\n",
				GetJobName(), status_t_names[newStatus] );

	_Status = newStatus;
	// TODO: add some state transition sanity-checking here?
	return true;
}
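// A minimal sketch (not in the original source) of the transition check the
// TODO above alludes to. The rule shown (a DONE node may only move back to
// STATUS_READY, e.g. on a retry) is an assumption for illustration, not
// DAGMan's actual policy, and IsSaneTransition() is a hypothetical helper.
static bool
IsSaneTransition( Job::status_t from, Job::status_t to )
{
	if( from == Job::STATUS_DONE && to != Job::STATUS_READY ) {
		return false;	// a finished node should not silently change state
	}
	return true;
}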
/*
========================
idParallelJobList_Threads::AddJob
========================
*/
ID_INLINE void idParallelJobList_Threads::AddJob( jobRun_t function, void* data )
{
	assert( done );
#if defined( _DEBUG )
	// make sure there isn't already a job with the same function and data in the list
	if( jobList.Num() < 1000 )	// don't do this N^2 slow check on big lists
	{
		for( int i = 0; i < jobList.Num(); i++ )
		{
			assert( jobList[i].function != function || jobList[i].data != data );
		}
	}
#endif
	if( 1 )	// JDC: this never worked in tech5!  !jobList.IsFull()
	{
		job_t& job = jobList.Alloc();
		job.function = function;
		job.data = data;
		job.executed = 0;
	}
	else
	{
		// debug output to show us what is overflowing
		int currentJobCount[MAX_REGISTERED_JOBS] = {};
		for( int i = 0; i < jobList.Num(); ++i )
		{
			const char* jobName = GetJobName( jobList[ i ].function );
			for( int j = 0; j < numRegisteredJobs; ++j )
			{
				if( jobName == registeredJobs[ j ].name )
				{
					currentJobCount[ j ]++;
					break;
				}
			}
		}
		// print the quantity of each job type
		for( int i = 0; i < numRegisteredJobs; ++i )
		{
			if( currentJobCount[ i ] > 0 )
			{
				idLib::Printf( "Job: %s, # %d\n", registeredJobs[ i ].name, currentJobCount[ i ] );
			}
		}
		idLib::Error( "Can't add job '%s', too many jobs %d", GetJobName( function ), jobList.Num() );
	}
}
void
Job::SetCategory( const char *categoryName, ThrottleByCategory &catThrottles )
{
	MyString tmpName( categoryName );

	if ( (_throttleInfo != NULL) &&
				(tmpName != *(_throttleInfo->_category)) ) {
		debug_printf( DEBUG_NORMAL, "Warning: new category %s for node %s "
					"overrides old value %s\n", categoryName, GetJobName(),
					_throttleInfo->_category->Value() );
		check_warning_strictness( DAG_STRICT_3 );
	}

	// Note: we must assign a ThrottleInfo here even if the name
	// already matches, for the case of lifting splices.
	ThrottleByCategory::ThrottleInfo *oldInfo = _throttleInfo;
	ThrottleByCategory::ThrottleInfo *throttleInfo =
				catThrottles.GetThrottleInfo( &tmpName );
	if ( throttleInfo != NULL ) {
		_throttleInfo = throttleInfo;
	} else {
		_throttleInfo = catThrottles.AddCategory( &tmpName );
	}

	if ( oldInfo != _throttleInfo ) {
		if ( oldInfo != NULL ) {
			oldInfo->_totalJobs--;
		}
		_throttleInfo->_totalJobs++;
	}
}
wxString BuildTag::ToString() const
{
	wxString str = wxString::Format(_T("%s-%s-%i"), GetBuildType().c_str(),
				GetJobName().c_str(), GetBuildNum());
	return str;
}
// Iterate across the Job's var values, and for any which have $(JOB) in them,
// substitute it. This substitution is draconian and will always happen.
void
Job::ResolveVarsInterpolations(void)
{
	NodeVar *var;

	varsFromDag->Rewind();
	while( (var = varsFromDag->Next()) != NULL ) {
		// XXX No way to escape $(JOB) in case, for some crazy reason, you
		// want a filename component actually to be '$(JOB)'.
		// It isn't hard to fix, I'll do it later.
		var->_value.replaceString("$(JOB)", GetJobName());
	}
}
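// A hedged sketch (not in the original source) of the escape mechanism the
// XXX comment above asks for: treat a hypothetical $$(JOB) spelling as a
// literal $(JOB) and only substitute the unescaped form. Both the $$(JOB)
// marker and the ReplaceJobMacro() helper are assumptions for illustration.
static void
ReplaceJobMacro( MyString &value, const char *jobName )
{
	const char *escaped = "$$(JOB)";		// hypothetical escape spelling
	const char *sentinel = "\x01JOB\x01";	// placeholder unlikely to occur in values
	value.replaceString( escaped, sentinel );	// protect escaped occurrences
	value.replaceString( "$(JOB)", jobName );	// the real substitution
	value.replaceString( sentinel, "$(JOB)" );	// restore literal text
}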
//---------------------------------------------------------------------------
bool
Job::SetCondorID(const CondorID& cid)
{
	bool ret = true;
	if( GetCluster() != -1 ) {
		debug_printf( DEBUG_NORMAL,
					"Reassigning the id of job %s from (%d.%d.%d) to "
					"(%d.%d.%d)\n", GetJobName(), GetCluster(), GetProc(),
					GetSubProc(), cid._cluster, cid._proc, cid._subproc );
		ret = false;
	}
	_CondorID = cid;
	return ret;
}
//---------------------------------------------------------------------------
bool
Job::UnmonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader )
{
	debug_printf( DEBUG_DEBUG_2, "Unmonitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	if ( !_logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already unmonitored\n", GetJobName() );
		return true;
	}

	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	debug_printf( DEBUG_DEBUG_1, "Unmonitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	CondorError errstack;
	bool result = logReader.unmonitorLogFile( GetLogFile(), errstack );
	if ( !result ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to unmonitor log "
					"file for node %s", GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		EXCEPT( "Fatal log file monitoring error!\n" );
	}

	if ( result ) {
		delete [] _logFile;
		_logFile = NULL;
		_logIsMonitored = false;
	}

	return result;
}
//---------------------------------------------------------------------------
bool
Job::Release(int proc)
{
	if( proc >= static_cast<int>( _onHold.size() ) ) {
		dprintf( D_FULLDEBUG,
					"Received release event for node %s, but job %d.%d "
					"is not on hold\n", GetJobName(), GetCluster(),
					GetProc() );
		return false; // We never marked this as being on hold
	}
	if( _onHold[proc] ) {
		_onHold[proc] = 0;
		--_jobProcsOnHold;
		return true;
	}
	return false;
}
//---------------------------------------------------------------------------
bool
Job::Hold(int proc)
{
	if( proc >= static_cast<int>( _onHold.size() ) ) {
		_onHold.resize( proc+1, 0 );
	}

	if( !_onHold[proc] ) {
		_onHold[proc] = 1;
		++_jobProcsOnHold;
		++_timesHeld;
		return true;
	} else {
		dprintf( D_FULLDEBUG,
					"Received hold event for node %s, and job %d.%d "
					"is already on hold!\n", GetJobName(), GetCluster(),
					proc );
	}
	return false;
}
//---------------------------------------------------------------------------
void
Job::TermAbortMetrics( int proc, const struct tm &eventTime,
			DagmanMetrics *metrics )
{
	if ( proc >= static_cast<int>( _gotEvents.size() ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning for node %s: got terminated or aborted event "
					"for proc %d, but no execute event!\n",
					GetJobName(), proc );
		check_warning_strictness( DAG_STRICT_2 );

		_gotEvents.resize( proc+1, 0 );
	}

	if ( !( _gotEvents[proc] & ABORT_TERM_MASK ) ) {
		_gotEvents[proc] |= ABORT_TERM_MASK;
		metrics->ProcFinished( eventTime );
	}
}
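// A hedged sketch (not in the original source) of the execute-side counterpart
// implied by TermAbortMetrics() above and the mask check in Cleanup() below:
// an execute event would set EXEC_MASK the same way a terminate/abort event
// sets ABORT_TERM_MASK. The method name and ProcStarted(), assumed here as
// the counterpart of ProcFinished(), are illustrative.
void
Job::ExecMetricsSketch( int proc, const struct tm &eventTime,
			DagmanMetrics *metrics )
{
	if ( proc >= static_cast<int>( _gotEvents.size() ) ) {
		_gotEvents.resize( proc+1, 0 );
	}
	if ( !( _gotEvents[proc] & EXEC_MASK ) ) {
		_gotEvents[proc] |= EXEC_MASK;
		metrics->ProcStarted( eventTime );
	}
}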
//---------------------------------------------------------------------------
void
Job::Cleanup()
{
	std::vector<unsigned char> s;
	_onHold.swap( s ); // Free memory in _onHold

	for ( int proc = 0;
				proc < static_cast<int>( _gotEvents.size() ); proc++ ) {
		if ( _gotEvents[proc] != ( EXEC_MASK | ABORT_TERM_MASK ) ) {
			debug_printf( DEBUG_NORMAL,
						"Warning for node %s: unexpected _gotEvents value "
						"for proc %d: %d!\n", GetJobName(), proc,
						(int)_gotEvents[proc] );
			check_warning_strictness( DAG_STRICT_2 );
		}
	}

	std::vector<unsigned char> s2;
	_gotEvents.swap( s2 ); // Free memory in _gotEvents
}
// OutputToStream()
// Outputs an ImageData instance as ASCII text to the stream provided.
void ImageData::OutputToStream( ostream &out )
{
	out << " Filename: \"" << GetFilename() << "\"" << endl;
	out << " File Format: " << GetFileFormat() << "; \""
		<< ImageFileFormatStrings[ GetFileFormat() ] << "\"" << endl;
	out << " File Size: " << GetFileSize() << " bytes" << endl;
	out << endl;
	out << " File Data:" << endl;
	out << " Bits Per Pixel: " << GetBitsPerPixel() << endl;
	out << " Compression: " << GetCompression() << "; \""
		<< ImageCompressionModeStrings[ GetCompression() ] << "\"" << endl;
	out << " DPI X/Y: " << GetDPIX() << "/" << GetDPIY() << endl;
	out << " Pixel Aspect Ratio: " << GetPixelAspectNum() << "/"
		<< GetPixelAspectDenom() << endl;
	out << " Gamma Correction: " << GetGammaNum() << "/"
		<< GetGammaDenom() << endl;
	out << " Thumbnail: ";
	if( GetThumbnail() == NULL )
		out << "No" << endl;
	else
		out << "Yes" << endl;
	out << endl;
	out << " Creator Data:" << endl;
	out << " Author: \"" << GetAuthor() << "\"" << endl;
	out << " Creator Program: \"" << GetCreator() << "\"" << endl;
	out << " Creator Version: " << GetCreatorVersion() << "."
		<< GetCreatorRevision() << GetCreatorSubRev() << endl;
	out << " Comment: \"" << GetComment() << "\"" << endl;
	out << " Job Name: \"" << GetJobName() << "\"" << endl;
	out << " Job Time: " << GetJobTime()[0] << ":" << GetJobTime()[1]
		<< ":" << GetJobTime()[2] << endl;
	out << " Date Stamp: " << GetDateStamp()[0] << "/" << GetDateStamp()[1]
		<< "/" << GetDateStamp()[2] << " " << GetDateStamp()[3] << ":"
		<< GetDateStamp()[4] << ":" << GetDateStamp()[5] << endl;
	out << endl;
	out << " Image Data:" << endl;
	out << " Width: " << GetImage()->GetWidth() << endl;
	out << " Height: " << GetImage()->GetHeight() << endl;
	out << " Type: " << GetImage()->GetType() << "; \""
		<< ImageTypeStrings[ GetImage()->GetType() ] << "\"" << endl;
	out << " Num Registers: ";
	if( GetImage()->GetType() == IMAGE_INDEXED )
		out << GetImage()->GetNumRegisters() << endl;
	else
		out << "N/A" << endl;
}
bool
Job::AddChild( Job* child, MyString &whynot )
{
	if( !this->CanAddChild( child, whynot ) ) {
		return false;
	}

	if( HasChild( child ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning: parent %s already has child %s\n",
					GetJobName(), child->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_CHILDREN, child->GetJobID() ) ) {
		whynot = "unknown error appending to CHILDREN queue";
		return false;
	}

	whynot = "n/a";
	return true;
}
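// A hedged sketch (not in the original source) of how AddParent() and
// AddChild() are presumably paired when the DAG records an edge: the
// dependency must land on both endpoints, and a failure on the second call
// leaves the queues out of sync. MakeEdge() is a hypothetical helper name.
static bool
MakeEdge( Job *parent, Job *child, MyString &whynot )
{
	if( !child->AddParent( parent, whynot ) ) {
		return false;
	}
	if( !parent->AddChild( child, whynot ) ) {
		// The parent's CHILDREN queue now disagrees with the child's
		// PARENTS queue; callers should treat this as fatal.
		return false;
	}
	return true;
}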
LONG
SaveJob(
    _In_ PJOB pJob)
{
    SCHEDULE Schedule;
    HKEY hJobsKey = NULL, hJobKey = NULL;
    LONG lError;

    TRACE("SaveJob()\n");

    lError = RegCreateKeyExW(HKEY_LOCAL_MACHINE,
                             L"System\\CurrentControlSet\\Services\\Schedule\\Jobs",
                             0,
                             NULL,
                             REG_OPTION_NON_VOLATILE,
                             KEY_WRITE,
                             NULL,
                             &hJobsKey,
                             NULL);
    if (lError != ERROR_SUCCESS)
        goto done;

    GetJobName(hJobsKey, pJob->Name);

    lError = RegCreateKeyExW(hJobsKey,
                             pJob->Name,
                             0,
                             NULL,
                             REG_OPTION_NON_VOLATILE,
                             KEY_WRITE,
                             NULL,
                             &hJobKey,
                             NULL);
    if (lError != ERROR_SUCCESS)
        goto done;

    Schedule.JobTime = pJob->JobTime;
    Schedule.DaysOfMonth = pJob->DaysOfMonth;
    Schedule.DaysOfWeek = pJob->DaysOfWeek;
    Schedule.Flags = pJob->Flags;

    lError = RegSetValueEx(hJobKey,
                           L"Schedule",
                           0,
                           REG_BINARY,
                           (PBYTE)&Schedule,
                           sizeof(Schedule));
    if (lError != ERROR_SUCCESS)
        goto done;

    lError = RegSetValueEx(hJobKey,
                           L"Command",
                           0,
                           REG_SZ,
                           (PBYTE)pJob->Command,
                           (wcslen(pJob->Command) + 1) * sizeof(WCHAR));

done:
    if (hJobKey != NULL)
        RegCloseKey(hJobKey);

    if (hJobsKey != NULL)
        RegCloseKey(hJobsKey);

    return lError;
}
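// A hedged counterpart sketch (not part of the original source): read one
// job's Schedule blob back from the same registry layout SaveJob() writes.
// LoadSchedule() is a hypothetical helper; error handling is abbreviated.
LONG
LoadSchedule(
    _In_ PCWSTR pszJobName,
    _Out_ SCHEDULE *pSchedule)
{
    HKEY hJobsKey = NULL, hJobKey = NULL;
    DWORD dwSize = sizeof(SCHEDULE);
    LONG lError;

    lError = RegOpenKeyExW(HKEY_LOCAL_MACHINE,
                           L"System\\CurrentControlSet\\Services\\Schedule\\Jobs",
                           0,
                           KEY_READ,
                           &hJobsKey);
    if (lError != ERROR_SUCCESS)
        return lError;

    lError = RegOpenKeyExW(hJobsKey, pszJobName, 0, KEY_READ, &hJobKey);
    if (lError == ERROR_SUCCESS)
    {
        lError = RegQueryValueExW(hJobKey,
                                  L"Schedule",
                                  NULL,
                                  NULL,
                                  (PBYTE)pSchedule,
                                  &dwSize);
        RegCloseKey(hJobKey);
    }

    RegCloseKey(hJobsKey);
    return lError;
}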
/*
========================
idParallelJobList_Threads::RunJobsInternal
========================
*/
int idParallelJobList_Threads::RunJobsInternal( unsigned int threadNum, threadJobListState_t& state, bool singleJob )
{
	if( state.version != version.GetValue() )
	{
		// trying to run an old version of this list that is already done
		return RUN_DONE;
	}

	assert( threadNum < MAX_THREADS );

	if( deferredThreadStats.startTime == 0 )
	{
		deferredThreadStats.startTime = Sys_Microseconds();	// first time any thread is running jobs from this list
	}

	int result = RUN_OK;

	do
	{
		// run through all signals and syncs before the last job that has been or is being executed
		// this loop is really an optimization to minimize the time spent in the fetchLock section below
		for( ; state.lastJobIndex < ( int )currentJob.GetValue() && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ )
		{
			if( jobList[state.lastJobIndex].data == &JOB_SIGNAL )
			{
				state.signalIndex++;
				assert( state.signalIndex < signalJobCount.Num() );
			}
			else if( jobList[state.lastJobIndex].data == &JOB_SYNCHRONIZE )
			{
				assert( state.signalIndex > 0 );
				if( signalJobCount[state.signalIndex - 1].GetValue() > 0 )
				{
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			}
			else if( jobList[state.lastJobIndex].data == &JOB_LIST_DONE )
			{
				if( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 )
				{
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			}
		}

		// try to lock to fetch a new job
		if( fetchLock.Increment() == 1 )
		{
			// grab a new job
			state.nextJobIndex = currentJob.Increment() - 1;

			// run through any remaining signals and syncs (this should rarely iterate more than once)
			for( ; state.lastJobIndex <= state.nextJobIndex && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ )
			{
				if( jobList[state.lastJobIndex].data == &JOB_SIGNAL )
				{
					state.signalIndex++;
					assert( state.signalIndex < signalJobCount.Num() );
				}
				else if( jobList[state.lastJobIndex].data == &JOB_SYNCHRONIZE )
				{
					assert( state.signalIndex > 0 );
					if( signalJobCount[state.signalIndex - 1].GetValue() > 0 )
					{
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
				}
				else if( jobList[state.lastJobIndex].data == &JOB_LIST_DONE )
				{
					if( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 )
					{
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
					// decrement the done count
					doneGuards[currentDoneGuard].Decrement();
				}
			}
			// release the fetch lock
			fetchLock.Decrement();
		}
		else
		{
			// release the fetch lock
			fetchLock.Decrement();
			// another thread is fetching right now so consider stalled
			return ( result | RUN_STALLED );
		}

		// if at the end of the job list we're done
		if( state.nextJobIndex >= jobList.Num() )
		{
			return ( result | RUN_DONE );
		}

		// execute the next job
		{
			uint64 jobStart = Sys_Microseconds();

			jobList[state.nextJobIndex].function( jobList[state.nextJobIndex].data );
			jobList[state.nextJobIndex].executed = 1;

			uint64 jobEnd = Sys_Microseconds();
			deferredThreadStats.threadExecTime[threadNum] += jobEnd - jobStart;

#ifndef _DEBUG
			if( jobs_longJobMicroSec.GetInteger() > 0 )
			{
				if( jobEnd - jobStart > jobs_longJobMicroSec.GetInteger()
						&& GetId() != JOBLIST_UTILITY )
				{
					longJobTime = ( jobEnd - jobStart ) * ( 1.0f / 1000.0f );
					longJobFunc = jobList[state.nextJobIndex].function;
					longJobData = jobList[state.nextJobIndex].data;
					const char* jobName = GetJobName( jobList[state.nextJobIndex].function );
					const char* jobListName = GetJobListName( GetId() );
					idLib::Printf( "%1.1f milliseconds for a single '%s' job from job list %s on thread %d\n",
								   longJobTime, jobName, jobListName, threadNum );
				}
			}
#endif
		}

		result |= RUN_PROGRESS;

		// decrease the job count for the current signal
		if( signalJobCount[state.signalIndex].Decrement() == 0 )
		{
			// if this was the very last job of the job list
			if( state.signalIndex == signalJobCount.Num() - 1 )
			{
				deferredThreadStats.endTime = Sys_Microseconds();
				return ( result | RUN_DONE );
			}
		}
	}
	while( !singleJob );

	return result;
}
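// A minimal usage sketch (not part of the original source) showing how a job
// list that RunJobsInternal() drains is typically filled and flushed. The
// REGISTER_PARALLEL_JOB macro and the AddJob()/Submit()/Wait() calls follow
// this codebase's conventions, but treat the exact spellings as assumptions.
static void DoubleValueJob( void* data )
{
	int* value = ( int* )data;
	*value *= 2;	// trivial payload so the example stays self-contained
}
REGISTER_PARALLEL_JOB( DoubleValueJob, "DoubleValueJob" );

void RunJobListExample( idParallelJobList* jobList )
{
	static int values[16];
	for( int i = 0; i < 16; i++ )
	{
		values[i] = i;
		jobList->AddJob( ( jobRun_t )DoubleValueJob, &values[i] );
	}
	jobList->Submit();	// hand the jobs to the worker threads
	jobList->Wait();	// block until every job has executed
}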
//---------------------------------------------------------------------------
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader,
			bool nfsIsError, bool recovery, const char *defaultNodeLog,
			bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already monitored\n", GetJobName() );
		return true;
	}

	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	std::string logFileStr;
	if ( _jobType == TYPE_CONDOR ) {
		// Check whether the user has specified a log file;
		// if not, we supply a default.
		MyString tmpLogFileStr = MultiLogFiles::loadLogFileNameFromSubFile(
					_cmdFile, _directory, _logFileIsXml, usingDefault );
		logFileStr = tmpLogFileStr.Value();
	} else {
		StringList logFiles;
		MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, logFiles );
		if ( tmpResult != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						tmpResult.Value() );
			LogMonitorFailed();
			return false;
		} else if ( logFiles.number() != 1 ) {
			debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
						"in submit file %s; we want 1\n",
						logFiles.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		} else {
			logFiles.rewind();
			logFileStr = logFiles.next();
		}
	}

	// Warn the user if the node's log file is in /tmp.
	if ( logFileStr.find( "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET, "Warning: "
					"Log file %s for node %s is in /tmp\n",
					logFileStr.c_str(), GetJobName() );
		check_warning_strictness( usingDefault ?
					DAG_STRICT_2 : DAG_STRICT_1 );
	}

	if ( logFileStr == "" ) {
		logFileStr = defaultNodeLog;
		_useDefaultLog = true;
		// The default user log is never XML; this could be specified
		// in the submit file and should be ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL, "Unable to get log file from "
					"submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logFileStr.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if ( append_default_log ) {
			// DAGMan is not going to look at the user-specified log;
			// it will look at the defaultNodeLog instead.
			logFileStr = defaultNodeLog;
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

	// This function returns true if the log file is on NFS and
	// that is an error. If the log file is on NFS, but nfsIsError
	// is false, it prints a warning but returns false.
	if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
				nfsIsError ) ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logFileStr.c_str() );
		LogMonitorFailed();
		return false;
	}

	delete [] _logFile;
	// Saving the log file name here in case the submit file gets changed.
	_logFile = strnewp( logFileStr.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	CondorError errstack;
	if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}