Example #1
bool
Job::AddParent( Job* parent, MyString &whynot )
{
	if( !this->CanAddParent( parent, whynot ) ) {
		return false;
	}

	if( HasParent( parent ) ) {
		debug_printf( DEBUG_QUIET,
					"Warning: child %s already has parent %s\n",
					GetJobName(), parent->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_PARENTS, parent->GetJobID() ) ) {
		whynot = "unknown error appending to PARENTS queue";
		return false;
	}
	if( parent->GetStatus() != STATUS_DONE ) {
		if( !Add( Q_WAITING, parent->GetJobID() ) ) {
				// this node's dependency queues are now out of sync and
				// thus the DAG state is FUBAR, so we should bail...
			EXCEPT( "Failed to add parent %s to job %s",
						parent->GetJobName(), GetJobName() );
			return false;
		}
	}
	whynot = "n/a";
	return true;
}
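AddParent() above keeps two queues in step: every parent is appended to Q_PARENTS, and only parents that are not yet STATUS_DONE are also appended to Q_WAITING. A minimal, self-contained sketch of that invariant follows; DepQueues, JobID and the std::set members are illustrative stand-ins, not DAGMan's actual types.

#include <cstdio>
#include <set>

using JobID = int;

// Illustrative stand-in for the two dependency queues kept per node.
struct DepQueues {
	std::set<JobID> parents;	// every parent ever added (think Q_PARENTS)
	std::set<JobID> waiting;	// parents this node still waits on (think Q_WAITING)

	void AddParent( JobID parent, bool parentDone ) {
		if( !parents.insert( parent ).second ) {
			return;		// already a parent; nothing to do
		}
		if( !parentDone ) {
			waiting.insert( parent );	// still blocks this node
		}
	}
};

int main()
{
	DepQueues q;
	q.AddParent( 7, /*parentDone=*/false );
	q.AddParent( 9, /*parentDone=*/true );
	std::printf( "parents=%zu waiting=%zu\n", q.parents.size(), q.waiting.size() );	// parents=2 waiting=1
	return 0;
}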
Example #2
bool
Job::SetStatus( status_t newStatus )
{
	debug_printf( DEBUG_DEBUG_1, "Job(%s)::_Status = %s\n",
		GetJobName(), status_t_names[_Status] );

	debug_printf( DEBUG_DEBUG_1, "Job(%s)::SetStatus(%s)\n",
		GetJobName(), status_t_names[newStatus] );
	
	_Status = newStatus;
		// TODO: add some state transition sanity-checking here?
	return true;
}
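The TODO in SetStatus() asks about transition sanity-checking. Below is a hypothetical, self-contained sketch of one such rule (a node should not quietly leave a done state); the enum values and name table are stand-ins and do not reproduce DAGMan's real status_t.

#include <cstdio>

// Stand-in status enum and name table, for illustration only.
enum status_t { STATUS_READY, STATUS_SUBMITTED, STATUS_DONE, STATUS_ERROR };
static const char* status_t_names[] = { "READY", "SUBMITTED", "DONE", "ERROR" };

// Example rule: once a node is DONE it should not silently go backwards.
static bool TransitionLooksSane( status_t from, status_t to )
{
	return !( from == STATUS_DONE && to != STATUS_DONE );
}

int main()
{
	status_t cur = STATUS_DONE;
	status_t next = STATUS_READY;
	if( !TransitionLooksSane( cur, next ) ) {
		std::printf( "Warning: suspicious transition %s -> %s\n",
			status_t_names[cur], status_t_names[next] );
	}
	return 0;
}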
Example #3
/*
========================
idParallelJobList_Threads::AddJob
========================
*/
ID_INLINE void idParallelJobList_Threads::AddJob( jobRun_t function, void* data )
{
	assert( done );
#if defined( _DEBUG )
	// make sure there isn't already a job with the same function and data in the list
	if( jobList.Num() < 1000 )  	// don't do this N^2 slow check on big lists
	{
		for( int i = 0; i < jobList.Num(); i++ )
		{
			assert( jobList[i].function != function || jobList[i].data != data );
		}
	}
#endif
	if( 1 )    // JDC: this never worked in tech5!  !jobList.IsFull() ) {
	{
		job_t& job = jobList.Alloc();
		job.function = function;
		job.data = data;
		job.executed = 0;
	}
	else
	{
		// debug output to show us what is overflowing
		int currentJobCount[MAX_REGISTERED_JOBS] = {};
		
		for( int i = 0; i < jobList.Num(); ++i )
		{
			const char* jobName = GetJobName( jobList[ i ].function );
			for( int j = 0; j < numRegisteredJobs; ++j )
			{
				if( jobName == registeredJobs[ j ].name )
				{
					currentJobCount[ j ]++;
					break;
				}
			}
		}
		
		// print the quantity of each job type
		for( int i = 0; i < numRegisteredJobs; ++i )
		{
			if( currentJobCount[ i ] > 0 )
			{
				idLib::Printf( "Job: %s, # %d", registeredJobs[ i ].name, currentJobCount[ i ] );
			}
		}
		idLib::Error( "Can't add job '%s', too many jobs %d", GetJobName( function ), jobList.Num() );
	}
}
Example #4
void
Job::SetCategory( const char *categoryName, ThrottleByCategory &catThrottles )
{
	MyString	tmpName( categoryName );

	if ( (_throttleInfo != NULL) &&
				(tmpName != *(_throttleInfo->_category)) ) {
		debug_printf( DEBUG_NORMAL, "Warning: new category %s for node %s "
					"overrides old value %s\n", categoryName, GetJobName(),
					_throttleInfo->_category->Value() );
		check_warning_strictness( DAG_STRICT_3 );
	}

		// Note: we must assign a ThrottleInfo here even if the name
		// already matches, for the case of lifting splices.
	ThrottleByCategory::ThrottleInfo *oldInfo = _throttleInfo;

	ThrottleByCategory::ThrottleInfo *throttleInfo =
				catThrottles.GetThrottleInfo( &tmpName );
	if ( throttleInfo != NULL ) {
		_throttleInfo = throttleInfo;
	} else {
		_throttleInfo = catThrottles.AddCategory( &tmpName );
	}

	if ( oldInfo != _throttleInfo ) {
		if ( oldInfo != NULL ) {
			oldInfo->_totalJobs--;
		}
		_throttleInfo->_totalJobs++;
	}
}
Example #5
wxString BuildTag::ToString() const
{
	wxString str = wxString::Format(_T("%s-%s-%i"), 
		GetBuildType().c_str(), GetJobName().c_str(), GetBuildNum());

	return str;
}
Example #6
// iterate across the Job's var values, and for any which have $(JOB) in them, 
// substitute it. This substitution is draconian and will always happen.
void
Job::ResolveVarsInterpolations(void)
{
	NodeVar *var;

	varsFromDag->Rewind();
	while( (var = varsFromDag->Next()) != NULL ) {
		// XXX No way to escape $(JOB) in case, for some crazy reason, you
		// want a filename component actually to be '$(JOB)'.
		// It isn't hard to fix, I'll do it later.
		var->_value.replaceString("$(JOB)", GetJobName());
	}
}
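A standalone sketch of the same unconditional $(JOB) substitution using std::string rather than MyString; ReplaceAll is an illustrative helper, and the NodeVar/replaceString pieces of the DAGMan code are not reproduced.

#include <cstdio>
#include <string>

// Replace every occurrence of token in s with value; no escaping,
// mirroring the "draconian" behavior described above.
static void ReplaceAll( std::string& s, const std::string& token,
			const std::string& value )
{
	for( std::string::size_type pos = s.find( token ); pos != std::string::npos;
				pos = s.find( token, pos + value.size() ) ) {
		s.replace( pos, token.size(), value );
	}
}

int main()
{
	std::string v = "outdir/$(JOB)/$(JOB).log";
	ReplaceAll( v, "$(JOB)", "NodeA" );
	std::printf( "%s\n", v.c_str() );	// outdir/NodeA/NodeA.log
	return 0;
}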
Example #7
//---------------------------------------------------------------------------
bool
Job::SetCondorID(const CondorID& cid)
{
	bool ret = true;
	if(GetCluster() != -1) {
		debug_printf( DEBUG_NORMAL, "Reassigning the id of job %s from (%d.%d.%d) to "
			"(%d.%d.%d)\n", GetJobName(), GetCluster(), GetProc(), GetSubProc(),
			cid._cluster, cid._proc,cid._subproc );
		ret = false;
	}
	_CondorID = cid;
	return ret;	
}
Example #8
//---------------------------------------------------------------------------
bool
Job::UnmonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader )
{
	debug_printf( DEBUG_DEBUG_2, "Unmonitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	if ( !_logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already unmonitored\n", GetJobName() );
		return true;
	}

	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	debug_printf( DEBUG_DEBUG_1, "Unmonitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	CondorError errstack;
	bool result = logReader.unmonitorLogFile( GetLogFile(), errstack );
	if ( !result ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to unmonitor log " "file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		EXCEPT( "Fatal log file monitoring error!\n" );
	}

	if ( result ) {
		delete [] _logFile;
		_logFile = NULL;
		_logIsMonitored = false;
	}

	return result;
}
Example #9
//---------------------------------------------------------------------------
bool
Job::Release(int proc)
{
	if( proc >= static_cast<int>( _onHold.size() ) ) {
		dprintf( D_FULLDEBUG, "Received release event for node %s, but job %d.%d "
			"is not on hold\n", GetJobName(), GetCluster(), GetProc() );
		return false; // We never marked this as being on hold
	}
	if( _onHold[proc] ) {
		_onHold[proc] = 0;
		--_jobProcsOnHold;
		return true;
	}
	return false;
}
Example #10
//---------------------------------------------------------------------------
bool
Job::Hold(int proc) 
{
	if( proc >= static_cast<int>( _onHold.size() ) ) {
		_onHold.resize( proc+1, 0 );
	}
	if( !_onHold[proc] ) {
		_onHold[proc] = 1;
		++_jobProcsOnHold;
		++_timesHeld;
		return true;
	} else {
		dprintf( D_FULLDEBUG, "Received hold event for node %s, and job %d.%d "
			"is already on hold!\n", GetJobName(), GetCluster(), proc );
	}
	return false;
}
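Hold() and Release() above track per-proc hold state with a lazily grown flag vector plus a running count. A compact, self-contained sketch of that bookkeeping pattern follows; HoldTracker and its members are illustrative names, not DAGMan types.

#include <cstdio>
#include <vector>

struct HoldTracker {
	std::vector<unsigned char> onHold;	// one flag per proc, grown on demand
	int procsOnHold = 0;

	bool Hold( int proc ) {
		if( proc >= static_cast<int>( onHold.size() ) ) {
			onHold.resize( proc + 1, 0 );	// grow lazily, default "not held"
		}
		if( !onHold[proc] ) {
			onHold[proc] = 1;
			++procsOnHold;
			return true;
		}
		return false;	// already on hold
	}

	bool Release( int proc ) {
		if( proc >= static_cast<int>( onHold.size() ) || !onHold[proc] ) {
			return false;	// never marked as held
		}
		onHold[proc] = 0;
		--procsOnHold;
		return true;
	}
};

int main()
{
	HoldTracker t;
	t.Hold( 3 );
	std::printf( "on hold after Hold(3): %d\n", t.procsOnHold );	// 1
	t.Release( 3 );
	std::printf( "on hold after Release(3): %d\n", t.procsOnHold );	// 0
	return 0;
}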
Example #11
//---------------------------------------------------------------------------
void
Job::TermAbortMetrics( int proc, const struct tm &eventTime,
			DagmanMetrics *metrics )
{
	if ( proc >= static_cast<int>( _gotEvents.size() ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning for node %s: got terminated or aborted event for proc %d, but no execute event!\n",
					GetJobName(), proc );
		check_warning_strictness( DAG_STRICT_2 );

		_gotEvents.resize( proc+1, 0 );
	}

	if ( !( _gotEvents[proc] & ABORT_TERM_MASK ) ) {
		_gotEvents[proc] |= ABORT_TERM_MASK;
		metrics->ProcFinished( eventTime );
	}
}
Example #12
//---------------------------------------------------------------------------
void
Job::Cleanup()
{
	std::vector<unsigned char> s;
	_onHold.swap(s); // Free memory in _onHold

	for ( int proc = 0; proc < static_cast<int>( _gotEvents.size() );
				proc++ ) {
		if ( _gotEvents[proc] != ( EXEC_MASK | ABORT_TERM_MASK ) ) {
			debug_printf( DEBUG_NORMAL,
					"Warning for node %s: unexpected _gotEvents value for proc %d: %d!\n",
					GetJobName(), proc, (int)_gotEvents[proc] );
			check_warning_strictness( DAG_STRICT_2 );
		}
	}

	std::vector<unsigned char> s2;
	_gotEvents.swap(s2); // Free memory in _gotEvents
}
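Cleanup() uses the swap-with-an-empty-temporary idiom to actually release vector storage rather than merely emptying it; a minimal standalone illustration is below (since C++11, shrink_to_fit() is the more direct spelling, though it is only a non-binding request).

#include <cstdio>
#include <vector>

int main()
{
	std::vector<unsigned char> v( 1000000, 0 );
	v.clear();	// size becomes 0, capacity does not
	std::printf( "capacity after clear: %zu\n", v.capacity() );

	std::vector<unsigned char>().swap( v );	// swap with an empty temporary
	std::printf( "capacity after swap:  %zu\n", v.capacity() );	// typically 0
	return 0;
}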
Example #13
// OutputToStream()
//  Outputs an ImageData instance as ASCII text to the stream provided.
void ImageData::OutputToStream( ostream &out ) {
  out << " Filename:             \"" << GetFilename() << "\"" << endl;
  out << "  File Format:         " << GetFileFormat() << "; \"" << ImageFileFormatStrings[ GetFileFormat() ] << "\"" << endl;
  out << "  File Size:           " << GetFileSize() << " bytes " << endl;
  out << endl;
  out << " File Data:" << endl;
  out << "  Bits Per Pixel:      " << GetBitsPerPixel() << endl;
  out << "  Compression:         " << GetCompression() << "; \"" << ImageCompressionModeStrings[ GetCompression() ] << "\"" << endl;
  out << "  DPI X/Y:             " << GetDPIX() << "/" <<  GetDPIY() << endl;
  out << "  Pixel Aspect Ration: " << GetPixelAspectNum() << "/" <<  GetPixelAspectDenom() << endl;
  out << "  Gamma Correction:    " << GetGammaNum() << "/" <<  GetGammaDenom() << endl;
  out << "  Thumbnail:           ";
  if( GetThumbnail() == NULL )
    out << "No" << endl;
  else
    out << "Yes" << endl;

  out << endl;
  out << " Creator Data:" << endl;
  out << "  Author:              \"" << GetAuthor() << "\"" << endl;
  out << "  Creator Program:     \"" << GetCreator() << "\"" << endl;
  out << "  Creator Version:     "   << GetCreatorVersion() << "." << GetCreatorRevision() << GetCreatorSubRev() << endl;
  out << "  Comment:             \"" << GetComment() << "\"" << endl;
  out << "  Job Name:            \"" << GetJobName() << "\"" << endl;
  out << "  Job Time:            "   << GetJobTime()[0] << ":" << GetJobTime()[1] << ":" << GetJobTime()[2] << endl;
  out << "  Date Stamp:          "   << GetDateStamp()[0] << "/" << GetDateStamp()[1] << "/" << GetDateStamp()[2] << "  "
                                     << GetDateStamp()[3] << ":" << GetDateStamp()[4] << ":" << GetDateStamp()[5] << endl;
  out << endl;

  out << " Image Data:" << endl;
  out << "  Width:               " << GetImage()->GetWidth() << endl;
  out << "  Height:              " << GetImage()->GetHeight() << endl;
  out << "  Type:                " << GetImage()->GetType() << "; \"" << ImageTypeStrings[ GetImage()->GetType() ] << "\"" << endl;
  out << "  Num Registers:       ";
  if( GetImage()->GetType() == IMAGE_INDEXED )
    out << GetImage()->GetNumRegisters() << endl;
  else
    out << "N/A" << endl;

}
Example #14
bool
Job::AddChild( Job* child, MyString &whynot )
{
	if( !this->CanAddChild( child, whynot ) ) {
		return false;
	}

	if( HasChild( child ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning: parent %s already has child %s\n",
					GetJobName(), child->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_CHILDREN, child->GetJobID() ) ) {
		whynot = "unknown error appending to CHILDREN queue";
		return false;
	}
	whynot = "n/a";
	return true;
}
Example #15
LONG
SaveJob(
    _In_ PJOB pJob)
{
    SCHEDULE Schedule;
    HKEY hJobsKey = NULL, hJobKey = NULL;
    LONG lError;

    TRACE("SaveJob()\n");

    lError = RegCreateKeyExW(HKEY_LOCAL_MACHINE,
                             L"System\\CurrentControlSet\\Services\\Schedule\\Jobs",
                             0,
                             NULL,
                             REG_OPTION_NON_VOLATILE,
                             KEY_WRITE,
                             NULL,
                             &hJobsKey,
                             NULL);
    if (lError != ERROR_SUCCESS)
        goto done;

    GetJobName(hJobsKey, pJob->Name);

    lError = RegCreateKeyExW(hJobsKey,
                             pJob->Name,
                             0,
                             NULL,
                             REG_OPTION_NON_VOLATILE,
                             KEY_WRITE,
                             NULL,
                             &hJobKey,
                             NULL);
    if (lError != ERROR_SUCCESS)
        goto done;

    Schedule.JobTime = pJob->JobTime;
    Schedule.DaysOfMonth = pJob->DaysOfMonth;
    Schedule.DaysOfWeek = pJob->DaysOfWeek;
    Schedule.Flags = pJob->Flags;

    lError = RegSetValueEx(hJobKey,
                           L"Schedule",
                           0,
                           REG_BINARY,
                           (PBYTE)&Schedule,
                           sizeof(Schedule));
    if (lError != ERROR_SUCCESS)
        goto done;

    lError = RegSetValueEx(hJobKey,
                           L"Command",
                           0,
                           REG_SZ,
                           (PBYTE)pJob->Command,
                           (wcslen(pJob->Command) + 1) * sizeof(WCHAR));
    if (lError != ERROR_SUCCESS)
        goto done;

done:
    if (hJobKey != NULL)
        RegCloseKey(hJobKey);

    if (hJobsKey != NULL)
        RegCloseKey(hJobsKey);

    return lError;
}
Example #16
/*
========================
idParallelJobList_Threads::RunJobsInternal
========================
*/
int idParallelJobList_Threads::RunJobsInternal( unsigned int threadNum, threadJobListState_t& state, bool singleJob )
{
	if( state.version != version.GetValue() )
	{
		// trying to run an old version of this list that is already done
		return RUN_DONE;
	}
	
	assert( threadNum < MAX_THREADS );
	
	if( deferredThreadStats.startTime == 0 )
	{
		deferredThreadStats.startTime = Sys_Microseconds();	// first time any thread is running jobs from this list
	}
	
	int result = RUN_OK;
	
	do
	{
	
		// run through all signals and syncs before the last job that has been or is being executed
		// this loop is really an optimization to minimize the time spent in the fetchLock section below
		for( ; state.lastJobIndex < ( int ) currentJob.GetValue() && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ )
		{
			if( jobList[state.lastJobIndex].data == & JOB_SIGNAL )
			{
				state.signalIndex++;
				assert( state.signalIndex < signalJobCount.Num() );
			}
			else if( jobList[state.lastJobIndex].data == & JOB_SYNCHRONIZE )
			{
				assert( state.signalIndex > 0 );
				if( signalJobCount[state.signalIndex - 1].GetValue() > 0 )
				{
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			}
			else if( jobList[state.lastJobIndex].data == & JOB_LIST_DONE )
			{
				if( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 )
				{
					// stalled on a synchronization point
					return ( result | RUN_STALLED );
				}
			}
		}
		
		// try to lock to fetch a new job
		if( fetchLock.Increment() == 1 )
		{
		
			// grab a new job
			state.nextJobIndex = currentJob.Increment() - 1;
			
			// run through any remaining signals and syncs (this should rarely iterate more than once)
			for( ; state.lastJobIndex <= state.nextJobIndex && state.lastJobIndex < jobList.Num(); state.lastJobIndex++ )
			{
				if( jobList[state.lastJobIndex].data == & JOB_SIGNAL )
				{
					state.signalIndex++;
					assert( state.signalIndex < signalJobCount.Num() );
				}
				else if( jobList[state.lastJobIndex].data == & JOB_SYNCHRONIZE )
				{
					assert( state.signalIndex > 0 );
					if( signalJobCount[state.signalIndex - 1].GetValue() > 0 )
					{
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
				}
				else if( jobList[state.lastJobIndex].data == & JOB_LIST_DONE )
				{
					if( signalJobCount[signalJobCount.Num() - 1].GetValue() > 0 )
					{
						// return this job to the list
						currentJob.Decrement();
						// release the fetch lock
						fetchLock.Decrement();
						// stalled on a synchronization point
						return ( result | RUN_STALLED );
					}
					// decrement the done count
					doneGuards[currentDoneGuard].Decrement();
				}
			}
			// release the fetch lock
			fetchLock.Decrement();
		}
		else
		{
			// release the fetch lock
			fetchLock.Decrement();
			// another thread is fetching right now so consider stalled
			return ( result | RUN_STALLED );
		}
		
		// if at the end of the job list we're done
		if( state.nextJobIndex >= jobList.Num() )
		{
			return ( result | RUN_DONE );
		}
		
		// execute the next job
		{
			uint64 jobStart = Sys_Microseconds();
			
			jobList[state.nextJobIndex].function( jobList[state.nextJobIndex].data );
			jobList[state.nextJobIndex].executed = 1;
			
			uint64 jobEnd = Sys_Microseconds();
			deferredThreadStats.threadExecTime[threadNum] += jobEnd - jobStart;
			
#ifndef _DEBUG
			if( jobs_longJobMicroSec.GetInteger() > 0 )
			{
				if( jobEnd - jobStart > jobs_longJobMicroSec.GetInteger()
						&& GetId() != JOBLIST_UTILITY )
				{
					longJobTime = ( jobEnd - jobStart ) * ( 1.0f / 1000.0f );
					longJobFunc = jobList[state.nextJobIndex].function;
					longJobData = jobList[state.nextJobIndex].data;
					const char* jobName = GetJobName( jobList[state.nextJobIndex].function );
					const char* jobListName = GetJobListName( GetId() );
					idLib::Printf( "%1.1f milliseconds for a single '%s' job from job list %s on thread %d\n", longJobTime, jobName, jobListName, threadNum );
				}
			}
#endif
		}
		
		result |= RUN_PROGRESS;
		
		// decrease the job count for the current signal
		if( signalJobCount[state.signalIndex].Decrement() == 0 )
		{
			// if this was the very last job of the job list
			if( state.signalIndex == signalJobCount.Num() - 1 )
			{
				deferredThreadStats.endTime = Sys_Microseconds();
				return ( result | RUN_DONE );
			}
		}
		
	}
	while( ! singleJob );
	
	return result;
}
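The fetchLock.Increment() == 1 test in RunJobsInternal() is effectively an atomic try-lock: only the thread that raises the counter from 0 to 1 goes on to fetch a job, every other thread backs off and reports the list as stalled. A self-contained sketch of that idiom with std::atomic follows; idSysInterlockedInteger and the surrounding job-list machinery are not reproduced.

#include <atomic>
#include <cstdio>

// Illustrative stand-in for the fetchLock counter used above.
static std::atomic<int> fetchLock{ 0 };

// Returns true if this caller won the right to fetch the next job.
static bool TryFetch()
{
	if( fetchLock.fetch_add( 1 ) + 1 == 1 ) {	// we raised it 0 -> 1
		// ... grab the next job index here ...
		fetchLock.fetch_sub( 1 );	// release
		return true;
	}
	fetchLock.fetch_sub( 1 );	// back off; another thread is fetching
	return false;	// caller treats this as "stalled"
}

int main()
{
	std::printf( "first attempt fetched: %s\n", TryFetch() ? "yes" : "no" );	// yes
	return 0;
}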
Example #17
//---------------------------------------------------------------------------
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
			bool recovery, const char *defaultNodeLog, bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already monitored\n", GetJobName() );
		return true;
	}

	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	std::string logFileStr;
	if ( _jobType == TYPE_CONDOR ) {
			// We check to see if the user has specified a log file
			// If not, we give him a default
		MyString templogFileStr = MultiLogFiles::loadLogFileNameFromSubFile( _cmdFile,
					_directory, _logFileIsXml, usingDefault);
		logFileStr = templogFileStr.Value();
	} else {
		StringList logFiles;
		MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, logFiles );
		if ( tmpResult != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						tmpResult.Value() );
			LogMonitorFailed();
			return false;
		} else if ( logFiles.number() != 1 ) {
			debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
						"in submit file %s; we want 1\n",
						logFiles.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		} else {
			logFiles.rewind();
			logFileStr = logFiles.next();
		}
	}

		// Warn the user if the node's log file is in /tmp.
	if ( logFileStr.find( "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET, "Warning: "
					"Log file %s for node %s is in /tmp\n",
					logFileStr.c_str(), GetJobName() );
		check_warning_strictness( usingDefault ? DAG_STRICT_2 : DAG_STRICT_1 );
	}

	if ( logFileStr == "" ) {
		logFileStr = defaultNodeLog;
		_useDefaultLog = true;
			// Default User log is never XML
			// This could be specified in the submit file and should be
			// ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL, "Unable to get log file from "
					"submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logFileStr.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if( append_default_log ) {
				// DAGman is not going to look at the user-specified log.
				// It will look at the defaultNode log.
			logFileStr = defaultNodeLog;
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

		// This function returns true if the log file is on NFS and
		// that is an error.  If the log file is on NFS, but nfsIsError
		// is false, it prints a warning but returns false.
	if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
				nfsIsError ) ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logFileStr.c_str() );
		LogMonitorFailed();
		return false;
	}

	delete [] _logFile;
		// Saving log file here in case submit file gets changed.
	_logFile = strnewp( logFileStr.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );
	CondorError errstack;
	if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}