Пример #1
0
//---------------------------------------------------------------------------
// This does only partial parsing -- only what we need for recovery mode
// and rescue initialization.
bool
JobstateLog::ParseLine( MyString &line, time_t &timestamp,
			MyString &nodeName, int &seqNum )
{
	line.chomp();
	line.Tokenize();
	const char* timestampTok = line.GetNextToken( " ", false );
	const char* nodeNameTok = line.GetNextToken( " ", false );
	(void)line.GetNextToken( " ", false ); // event name
	(void)line.GetNextToken( " ", false ); // condor id
	(void)line.GetNextToken( " ", false ); // job tag (pegasus site)
	(void)line.GetNextToken( " ", false ); // unused
	const char* seqNumTok = line.GetNextToken( " ", false );

	if ( (timestampTok == NULL) || (nodeNameTok == NULL) ) {
		debug_printf( DEBUG_QUIET, "Warning: error parsing "
					"jobstate.log file line <%s>\n", line.Value() );
		check_warning_strictness( DAG_STRICT_1 );
		return false;
	}

		// fetch the number, and get a pointer to the first char after
		// if the pointer did not advance, then there was no number to parse.
	char *pend;
	timestamp = (time_t)strtoll(timestampTok, &pend, 10);

	if (pend == timestampTok) {
		debug_printf( DEBUG_QUIET, "Warning: error reading "
					"timestamp in jobstate.log file line <%s>\n",
					line.Value() );
		check_warning_strictness( DAG_STRICT_1 );
		return false;
	}

	nodeName = nodeNameTok;

	seqNum = 0;
	if ( seqNumTok ) {
		seqNum = (int)strtol(seqNumTok, &pend, 10);
		if (pend == seqNumTok) {
			debug_printf( DEBUG_QUIET, "Warning: error reading "
						"sequence number in jobstate.log file line <%s>\n",
						line.Value() );
			check_warning_strictness( DAG_STRICT_1 );
			return false;
		}
	}

	return true;
}
Пример #2
0
void
Job::SetCategory( const char *categoryName, ThrottleByCategory &catThrottles )
{
	MyString	tmpName( categoryName );

	if ( (_throttleInfo != NULL) &&
				(tmpName != *(_throttleInfo->_category)) ) {
		debug_printf( DEBUG_NORMAL, "Warning: new category %s for node %s "
					"overrides old value %s\n", categoryName, GetJobName(),
					_throttleInfo->_category->Value() );
		check_warning_strictness( DAG_STRICT_3 );
	}

		// Note: we must assign a ThrottleInfo here even if the name
		// already matches, for the case of lifting splices.
	ThrottleByCategory::ThrottleInfo *oldInfo = _throttleInfo;

	ThrottleByCategory::ThrottleInfo *throttleInfo =
				catThrottles.GetThrottleInfo( &tmpName );
	if ( throttleInfo != NULL ) {
		_throttleInfo = throttleInfo;
	} else {
		_throttleInfo = catThrottles.AddCategory( &tmpName );
	}

	if ( oldInfo != _throttleInfo ) {
		if ( oldInfo != NULL ) {
			oldInfo->_totalJobs--;
		}
		_throttleInfo->_totalJobs++;
	}
}
Пример #3
0
//---------------------------------------------------------------------------
void
JobstateLog::WriteEvent( const ULogEvent *event, Job *node )
{
	if ( !_jobstateLogFile ) {
		return;
	}

	ASSERT( node );

	const char *prefix = "ULOG_";
	const char *eventName = ULogEventNumberNames[event->eventNumber];
	if ( strstr( eventName, prefix ) != eventName ) {
       	debug_printf( DEBUG_QUIET, "Warning: didn't find expected prefix "
					"%s in event name %s\n", prefix, eventName );
		check_warning_strictness( DAG_STRICT_1 );
	} else {
		eventName = eventName + strlen( prefix );
	}

	if ( eventName != NULL ) {
		MyString condorID;
		CondorID2Str( event->cluster, event->proc, condorID );
		struct tm eventTm = event->eventTime;
		time_t eventTime = mktime( &eventTm );
		Write( &eventTime, node, eventName, condorID.Value() );
	}
}
Пример #4
0
bool
Job::AddParent( Job* parent, MyString &whynot )
{
	if( !this->CanAddParent( parent, whynot ) ) {
		return false;
	}

	if( HasParent( parent ) ) {
		debug_printf( DEBUG_QUIET,
					"Warning: child %s already has parent %s\n",
					GetJobName(), parent->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_PARENTS, parent->GetJobID() ) ) {
		whynot = "unknown error appending to PARENTS queue";
		return false;
	}
    if( parent->GetStatus() != STATUS_DONE ) {
		if( !Add( Q_WAITING, parent->GetJobID() ) ) {
            // this node's dependency queues are now out of sync and
            // thus the DAG state is FUBAR, so we should bail...
			EXCEPT( "Failed to add parent %s to job %s",
						parent->GetJobName(), GetJobName() );
			return false;
		}
	}
	whynot = "n/a";
    return true;
}
Пример #5
0
//---------------------------------------------------------------------------
void
DagmanClassad::GetInfo( MyString &owner, MyString &nodeName )
{
	if ( !_valid ) {
		debug_printf( DEBUG_VERBOSE,
					"Skipping ClassAd query -- DagmanClassad object is invalid\n" );
		return;
	}

	Qmgr_connection *queue = OpenConnection();
	if ( !queue ) {
		return;
	}

	if ( !GetDagAttribute( ATTR_OWNER, owner ) ) {
		check_warning_strictness( DAG_STRICT_1 );
		owner = "undef";
	}

	if ( !GetDagAttribute( ATTR_DAG_NODE_NAME, nodeName ) ) {
		// We should only get this value if we're a sub-DAG.
		nodeName = "undef";
	}

	CloseConnection( queue );

	return;
}
Пример #6
0
//---------------------------------------------------------------------------
DagmanClassad::DagmanClassad( const CondorID &DAGManJobId ) :
	_valid( false ),
	_schedd( NULL )
{
	CondorID defaultCondorId;
	if ( DAGManJobId == defaultCondorId ) {
		debug_printf( DEBUG_QUIET, "No HTCondor ID available for DAGMan (running on command line?); DAG status will not be reported to ClassAd\n" );
		return;
	}

	_dagmanId = DAGManJobId;

	_schedd = new DCSchedd( NULL, NULL );
	if ( !_schedd || !_schedd->locate() ) {
		const char *errMsg = _schedd ? _schedd->error() : "?";
		debug_printf( DEBUG_QUIET,
					"WARNING: can't find address of local schedd for ClassAd updates (%s)\n",
					errMsg );
		check_warning_strictness( DAG_STRICT_3 );
		return;
	}

	_valid = true;

	InitializeMetrics();
}
Пример #7
0
//---------------------------------------------------------------------------
void
DagmanClassad::CloseConnection( Qmgr_connection *queue )
{
	if ( !DisconnectQ( queue ) ) {
		debug_printf( DEBUG_QUIET,
					"WARNING: queue transaction failed.  No attributes were set.\n" );
		check_warning_strictness( DAG_STRICT_3 );
	}
}
Пример #8
0
//---------------------------------------------------------------------------
void
DagmanClassad::SetDagAttribute( const char *attrName, const ClassAd &ad )
{
	if ( SetAttributeExpr( _dagmanId._cluster, _dagmanId._proc,
						  attrName, &ad ) != 0 ) {
		debug_printf( DEBUG_QUIET,
					  "WARNING: failed to set attribute %s\n", attrName );
		check_warning_strictness( DAG_STRICT_3 );
	}
}
Пример #9
0
bool
AddNode( Dag *dag, const char *name,
		 const char* directory,
		 const char* submitFile,
		 bool noop,
		 bool done, bool isFinal,
		 MyString &failReason )
{
	MyString why;
	if( !IsValidNodeName( dag, name, why ) ) {
		failReason = why;
		return false;
	}
	if( !IsValidSubmitFileName( submitFile, why ) ) {
		failReason = why;
		return false;
	}
	if( done && isFinal) {
		failReason.formatstr( "Warning: FINAL Job %s cannot be set to DONE\n",
					name );
        debug_printf( DEBUG_QUIET, "%s", failReason.Value() );
		(void)check_warning_strictness( DAG_STRICT_1, false );
		done = false;
	}
	Job* node = new Job( name, directory, submitFile );
	if( !node ) {
		dprintf( D_ALWAYS, "ERROR: out of memory!\n" );
			// we already know we're out of memory, so filling in
			// FailReason will likely fail, but give it a shot...
		failReason = "out of memory!";
		return false;
	}
	node->SetNoop( noop );
	if( done ) {
		node->SetStatus( Job::STATUS_DONE );
	}
	node->SetFinal( isFinal );
	ASSERT( dag != NULL );
	if( !dag->Add( *node ) ) {
		failReason = "unknown failure adding ";
		failReason += isFinal? "Final " : "";
		failReason += "node to DAG";
		delete node;
		return false;
	}
	failReason = "n/a";
	return true;
}
Пример #10
0
//---------------------------------------------------------------------------
Qmgr_connection *
DagmanClassad::OpenConnection()
{
		// Open job queue
	CondorError errstack;
	Qmgr_connection *queue = ConnectQ( _schedd->addr(), 0, false,
				&errstack, NULL, _schedd->version() );
	if ( !queue ) {
		debug_printf( DEBUG_QUIET,
					"WARNING: failed to connect to queue manager (%s)\n",
					errstack.getFullText().c_str() );
		check_warning_strictness( DAG_STRICT_3 );
		return NULL;
	}

	return queue;
}
Пример #11
0
//---------------------------------------------------------------------------
void
Job::TermAbortMetrics( int proc, const struct tm &eventTime,
			DagmanMetrics *metrics )
{
	if ( proc >= static_cast<int>( _gotEvents.size() ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning for node %s: got terminated or aborted event for proc %d, but no execute event!\n",
					GetJobName(), proc );
		check_warning_strictness( DAG_STRICT_2 );

		_gotEvents.resize( proc+1, 0 );
	}

	if ( !( _gotEvents[proc] & ABORT_TERM_MASK ) ) {
		_gotEvents[proc] |= ABORT_TERM_MASK;
		metrics->ProcFinished( eventTime );
	}
}
Пример #12
0
//---------------------------------------------------------------------------
void
Job::Cleanup()
{
	std::vector<unsigned char> s;
	_onHold.swap(s); // Free memory in _onHold

	for ( int proc = 0; proc < static_cast<int>( _gotEvents.size() );
				proc++ ) {
		if ( _gotEvents[proc] != ( EXEC_MASK | ABORT_TERM_MASK ) ) {
			debug_printf( DEBUG_NORMAL,
					"Warning for node %s: unexpected _gotEvents value for proc %d: %d!\n",
					GetJobName(), proc, (int)_gotEvents[proc] );
			check_warning_strictness( DAG_STRICT_2 );
		}
	}

	std::vector<unsigned char> s2;
	_gotEvents.swap(s2); // Free memory in _gotEvents
}
Пример #13
0
bool
Job::AddChild( Job* child, MyString &whynot )
{
	if( !this->CanAddChild( child, whynot ) ) {
		return false;
	}

	if( HasChild( child ) ) {
		debug_printf( DEBUG_NORMAL,
					"Warning: parent %s already has child %s\n",
					GetJobName(), child->GetJobName() );
		check_warning_strictness( DAG_STRICT_3 );
		return true;
	}

	if( !Add( Q_CHILDREN, child->GetJobID() ) ) {
		whynot = "unknown error appending to CHILDREN queue";
		return false;
	}
	whynot = "n/a";
    return true;
}
Пример #14
0
//-----------------------------------------------------------------------------
int util_popen (ArgList &args) {
	MyString cmd; // for debug output
	args.GetArgsStringForDisplay( &cmd );
    debug_printf( DEBUG_VERBOSE, "Running: %s\n", cmd.Value() );

	FILE *fp = my_popen( args, "r", MY_POPEN_OPT_WANT_STDERR );

    int r = 0;
    if (fp == NULL || (r = my_pclose(fp) & 0xff) != 0) {
		debug_printf( DEBUG_QUIET, "Warning: failure: %s\n", cmd.Value() );
		if( fp != NULL ) {
			debug_printf ( DEBUG_QUIET,
						"\t(my_pclose() returned %d (errno %d, %s))\n",
						r, errno, strerror( errno ) );
		} else {
			debug_printf ( DEBUG_QUIET,
						"\t(my_popen() returned NULL (errno %d, %s))\n",
						errno, strerror( errno ) );
			r = -1;
		}
		check_warning_strictness( DAG_STRICT_1 );
    }
    return r;
}
Пример #15
0
//-----------------------------------------------------------------------------
int
Script::BackgroundRun( int reaperId, int dagStatus, int failedCount )
{
	TmpDir		tmpDir;
	MyString	errMsg;
	if ( !tmpDir.Cd2TmpDir( _node->GetDirectory(), errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to node directory %s: %s\n",
				_node->GetDirectory(), errMsg.Value() );

		return 0;
	}

	// Construct the command line, replacing some tokens with
    // information about the job.  All of these values would probably
    // be better inserted into the environment, rather than passed on
    // the command-line... some should be in the job's env as well...

    const char *delimiters = " \t";
    char * token;
	ArgList args;
    char * cmd = strnewp(_cmd);
    for (token = strtok (cmd,  delimiters) ; token != NULL ;
         token = strtok (NULL, delimiters)) {

		MyString arg;

		if ( !strcasecmp( token, "$JOB" ) ) {
			arg += _node->GetJobName();

		} else if ( !strcasecmp( token, "$RETRY" ) ) {
            arg += _node->GetRetries();

		} else if ( !strcasecmp( token, "$MAX_RETRIES" ) ) {
            arg += _node->GetRetryMax();

        } else if ( !strcasecmp( token, "$JOBID" ) ) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $JOBID macro should "
							"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
				arg += token;
			} else {
            	arg += _node->_CondorID._cluster;
            	arg += '.';
            	arg += _node->_CondorID._proc;
			}

        } else if (!strcasecmp(token, "$RETURN")) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $RETURN macro should "
							"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
			}
			arg += _retValJob;

		} else if (!strcasecmp( token, "$PRE_SCRIPT_RETURN" ) ) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $PRE_SCRIPT_RETURN macro should "
						"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
			}
			arg += _retValScript;

		} else if (!strcasecmp(token, "$DAG_STATUS")) {
			arg += dagStatus;

		} else if (!strcasecmp(token, "$FAILED_COUNT")) {
			arg += failedCount;

		} else if (token[0] == '$') {
			// This should probably be a fatal error when -strict is
			// implemented.
			debug_printf( DEBUG_QUIET, "Warning: unrecognized macro %s "
						"in node %s %s script arguments\n", token,
						_node->GetJobName(), _post ? "POST" : "PRE" );
			check_warning_strictness( DAG_STRICT_1 );
			arg += token;
        } else {
			arg += token;
		}

		args.AppendArg(arg.Value());
    }

	_pid = daemonCore->Create_Process( cmd, args,
									   PRIV_UNKNOWN, reaperId, FALSE,
									   NULL, NULL, NULL, NULL, NULL, 0 );
    delete [] cmd;

	if ( !tmpDir.Cd2MainDir( errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to original directory: %s\n",
				errMsg.Value() );
		return 0;
	}

	return _pid;
}
Пример #16
0
//-------------------------------------------------------------------------
bool
condor_submit( const Dagman &dm, const char* cmdFile, CondorID& condorID,
			   const char* DAGNodeName, MyString &DAGParentNodeNames,
			   List<Job::NodeVar> *vars, int retry,
			   const char* directory, const char *workflowLogFile,
			   bool hold_claim )
{
	TmpDir		tmpDir;
	MyString	errMsg;
	if ( !tmpDir.Cd2TmpDir( directory, errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to node directory %s: %s\n",
				directory, errMsg.Value() );
		return false;
	}

	ArgList args;

	// construct arguments to condor_submit to add attributes to the
	// job classad which identify the job's node name in the DAG, the
	// node names of its parents in the DAG, and the job ID of DAGMan
	// itself; then, define submit_event_notes to print the job's node
	// name inside the submit event in the userlog

	// NOTE: we specify the job ID of DAGMan using only its cluster ID
	// so that it may be referenced by jobs in their priority
	// attribute (which needs an int, not a string).  Doing so allows
	// users to effectively "batch" jobs by DAG so that when they
	// submit many DAGs to the same schedd, all the ready jobs from
	// one DAG complete before any jobs from another begin.

	args.AppendArg( dm.condorSubmitExe );

	args.AppendArg( "-a" );
	MyString nodeName = MyString(ATTR_DAG_NODE_NAME_ALT) + " = " + DAGNodeName;
	args.AppendArg( nodeName.Value() );

		// append a line adding the parent DAGMan's cluster ID to the job ad
	args.AppendArg( "-a" );
	MyString dagJobId = MyString( "+" ) + ATTR_DAGMAN_JOB_ID + " = " +
				dm.DAGManJobId._cluster;
	args.AppendArg( dagJobId.Value() );

		// now we append a line setting the same thing as a submit-file macro
		// (this is necessary so the user can reference it in the priority)
	args.AppendArg( "-a" );
	MyString dagJobIdMacro = MyString( "" ) + ATTR_DAGMAN_JOB_ID + " = " +
				dm.DAGManJobId._cluster;
	args.AppendArg( dagJobIdMacro.Value() );

	args.AppendArg( "-a" );
	MyString submitEventNotes = MyString(
				"submit_event_notes = DAG Node: " ) + DAGNodeName;
	args.AppendArg( submitEventNotes.Value() );

	ASSERT( workflowLogFile );

		// We need to append the DAGman default log file to
		// the log file list
	args.AppendArg( "-a" );
	std::string dlog( "dagman_log = " );
	dlog += workflowLogFile;
	args.AppendArg( dlog.c_str() );
	debug_printf( DEBUG_VERBOSE, "Adding a DAGMan workflow log %s\n",
				workflowLogFile );

		// Now append the mask
	debug_printf( DEBUG_VERBOSE, "Masking the events recorded in the DAGMAN workflow log\n" );
	args.AppendArg( "-a" );
	std::string dmask("+");
	dmask += ATTR_DAGMAN_WORKFLOW_MASK;
	dmask += " = \"";
	const char *eventMask = getEventMask();
	debug_printf( DEBUG_VERBOSE, "Mask for workflow log is %s\n",
				eventMask );
	dmask += eventMask;
	dmask += "\"";
	args.AppendArg( dmask.c_str() );

		// Suppress the job's log file if that option is enabled.
	if ( dm._suppressJobLogs ) {
		debug_printf( DEBUG_VERBOSE, "Suppressing node job log file\n" );
		args.AppendArg( "-a" );
		args.AppendArg( "log = ''" );
	}

	ArgList parentNameArgs;
	parentNameArgs.AppendArg( "-a" );
	MyString parentNodeNames = MyString( "+DAGParentNodeNames = " ) +
	                        "\"" + DAGParentNodeNames + "\"";
	parentNameArgs.AppendArg( parentNodeNames.Value() );

		// set any VARS specified in the DAG file
	MyString anotherLine;
	ListIterator<Job::NodeVar> varsIter(*vars);
	Job::NodeVar nodeVar;
	while ( varsIter.Next(nodeVar) ) {

			// Substitute the node retry count if necessary.  Note that
			// we can't do this in Job::ResolveVarsInterpolations()
			// because that's only called at parse time.
		MyString value = nodeVar._value;
		MyString retryStr( retry );
		value.replaceString( "$(RETRY)", retryStr.Value() );
		MyString varStr = nodeVar._name + " = " + value;

		args.AppendArg( "-a" );
		args.AppendArg( varStr.Value() );
	}

		// Set the special DAG_STATUS variable (mainly for use by
		// "final" nodes).
	args.AppendArg( "-a" );
	MyString var = "DAG_STATUS = ";
	var += dm.dag->_dagStatus;
	args.AppendArg( var.Value() );

		// Set the special FAILED_COUNT variable (mainly for use by
		// "final" nodes).
	args.AppendArg( "-a" );
	var = "FAILED_COUNT = ";
	var += dm.dag->NumNodesFailed();
	args.AppendArg( var.Value() );

		// how big is the command line so far
	MyString display;
	args.GetArgsStringForDisplay( &display );
	int cmdLineSize = display.Length();

	parentNameArgs.GetArgsStringForDisplay( &display );
	int DAGParentNodeNamesLen = display.Length();
		// how many additional chars must we still add to command line
	        // NOTE: according to the POSIX spec, the args +
   	        // environ given to exec() cannot exceed
   	        // _POSIX_ARG_MAX, so we also need to calculate & add
   	        // the size of environ** to reserveNeeded
	int reserveNeeded = strlen( cmdFile );
	int maxCmdLine = _POSIX_ARG_MAX;

		// if we don't have room for DAGParentNodeNames, leave it unset
	if( cmdLineSize + reserveNeeded + DAGParentNodeNamesLen > maxCmdLine ) {
		debug_printf( DEBUG_NORMAL, "Warning: node %s has too many parents "
					  "to list in its classad; leaving its DAGParentNodeNames "
					  "attribute undefined\n", DAGNodeName );
		check_warning_strictness( DAG_STRICT_3 );
	} else {
		args.AppendArgsFromArgList( parentNameArgs );
	}

	if( hold_claim ){
		args.AppendArg( "-a" );
		MyString holdit = MyString("+") + MyString(ATTR_JOB_KEEP_CLAIM_IDLE) + " = "
			+ dm._claim_hold_time;
		args.AppendArg( holdit.Value() );	
	}
	
	if (dm._submitDagDeepOpts.suppress_notification) {
		args.AppendArg( "-a" );
		MyString notify = MyString("notification = never");
		args.AppendArg( notify.Value() );
	}

	args.AppendArg( cmdFile );

	bool success = do_submit( args, condorID, dm.prohibitMultiJobs );

	if ( !tmpDir.Cd2MainDir( errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to original directory: %s\n",
				errMsg.Value() );
		success = false;
	}

	return success;
}
Пример #17
0
void condor_event_timer () {

	ASSERT( dagman.dag != NULL );

    //------------------------------------------------------------------------
    // Proceed with normal operation
    //
    // At this point, the DAG is bootstrapped.  All jobs premarked DONE
    // are in a STATUS_DONE state, and all their children have been
    // marked ready to submit.
    //
    // If recovery was needed, the log file has been completely read and
    // we are ready to proceed with jobs yet unsubmitted.
    //------------------------------------------------------------------------

	if( dagman.paused == true ) {
		debug_printf( DEBUG_DEBUG_1, "(DAGMan paused)\n" );
		return;
	}

    static int prevJobsDone = 0;
    static int prevJobs = 0;
    static int prevJobsFailed = 0;
    static int prevJobsSubmitted = 0;
    static int prevJobsReady = 0;
    static int prevScriptRunNodes = 0;
    static int prevJobsHeld = 0;

	int justSubmitted;
	justSubmitted = dagman.dag->SubmitReadyJobs(dagman);
	if( justSubmitted ) {
			// Note: it would be nice to also have the proc submit
			// count here.  wenger, 2006-02-08.
		debug_printf( DEBUG_VERBOSE, "Just submitted %d job%s this cycle...\n",
				  	justSubmitted, justSubmitted == 1 ? "" : "s" );
	}

	// If the log has grown
	if( dagman.dag->DetectCondorLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( CONDORLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						"ProcessLogEvents(CONDORLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

	if( dagman.dag->DetectDaPLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( DAPLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						"ProcessLogEvents(DAPLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

    // print status if anything's changed (or we're in a high debug level)
    if( prevJobsDone != dagman.dag->NumNodesDone( true )
        || prevJobs != dagman.dag->NumNodes( true )
        || prevJobsFailed != dagman.dag->NumNodesFailed()
        || prevJobsSubmitted != dagman.dag->NumJobsSubmitted()
        || prevJobsReady != dagman.dag->NumNodesReady()
        || prevScriptRunNodes != dagman.dag->ScriptRunNodeCount()
		|| prevJobsHeld != dagman.dag->NumHeldJobProcs()
		|| DEBUG_LEVEL( DEBUG_DEBUG_4 ) ) {
		print_status();

        prevJobsDone = dagman.dag->NumNodesDone( true );
        prevJobs = dagman.dag->NumNodes( true );
        prevJobsFailed = dagman.dag->NumNodesFailed();
        prevJobsSubmitted = dagman.dag->NumJobsSubmitted();
        prevJobsReady = dagman.dag->NumNodesReady();
        prevScriptRunNodes = dagman.dag->ScriptRunNodeCount();
		prevJobsHeld = dagman.dag->NumHeldJobProcs();
		
		if( dagman.dag->GetDotFileUpdate() ) {
			dagman.dag->DumpDotFile();
		}
	}

	dagman.dag->DumpNodeStatus( false, false );

    ASSERT( dagman.dag->NumNodesDone( true ) + dagman.dag->NumNodesFailed()
			<= dagman.dag->NumNodes( true ) );

    //
    // If DAG is complete, hurray, and exit.
    //
    if( dagman.dag->DoneSuccess( true ) ) {
        ASSERT( dagman.dag->NumJobsSubmitted() == 0 );
		dagman.dag->CheckAllJobs();
        debug_printf( DEBUG_NORMAL, "All jobs Completed!\n" );
		dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
		if ( dagman.dag->NumIdleJobProcs() != 0 ) {
			debug_printf( DEBUG_NORMAL, "Warning:  DAGMan thinks there "
						"are %d idle jobs, even though the DAG is "
						"completed!\n", dagman.dag->NumIdleJobProcs() );
			check_warning_strictness( DAG_STRICT_1 );
		}
		ExitSuccess();
		return;
    }

	//
	// DAG has failed -- dump rescue DAG.
	//
    if( dagman.dag->DoneFailed( true ) ) {
		main_shutdown_rescue( EXIT_ERROR, dagman.dag->_dagStatus );
		return;
	}

	//
	// DAG has succeeded but we haven't run final node yet, so do that.
	//
    if( dagman.dag->DoneSuccess( false ) ) {
		dagman.dag->StartFinalNode();
		return;
	}

		// If the DAG is halted, we don't want to actually exit yet if
		// jobs are still in the queue, or any POST scripts need to be
		// run (we need to run POST scripts so we don't "waste" jobs
		// that completed; on the other hand, we don't care about waiting
		// for PRE scripts because they'll be re-run when the rescue
		// DAG is run anyhow).
	if ( dagman.dag->IsHalted() && dagman.dag->NumJobsSubmitted() == 0 &&
				dagman.dag->PostRunNodeCount() == 0 &&
				!dagman.dag->RunningFinalNode() ) {
		debug_printf ( DEBUG_QUIET, "Exiting because DAG is halted "
					"and no jobs or scripts are running\n" );
		main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_HALTED );
		return;
	}

    //
    // If no jobs are submitted and no scripts are running, but the
    // dag is not complete, then at least one job failed, or a cycle
    // exists.  (Note that if the DAG completed successfully, we already
	// returned from this function above.)
    // 
    if( dagman.dag->FinishedRunning( false ) ) {
		Dag::dag_status dagStatus = Dag::DAG_STATUS_OK;
		if( dagman.dag->DoneFailed( false ) ) {
			if( DEBUG_LEVEL( DEBUG_QUIET ) ) {
				debug_printf( DEBUG_QUIET,
							  "ERROR: the following job(s) failed:\n" );
				dagman.dag->PrintJobList( Job::STATUS_ERROR );
			}
			dagStatus = Dag::DAG_STATUS_NODE_FAILED;
		} else {
			// no jobs failed, so a cycle must exist
			debug_printf( DEBUG_QUIET, "ERROR: DAG finished but not all "
						"nodes are complete -- checking for a cycle...\n" );
			if( dagman.dag->isCycle() ) {
				debug_printf (DEBUG_QUIET, "... ERROR: a cycle exists "
							"in the dag, please check input\n");
				dagStatus = Dag::DAG_STATUS_CYCLE;
			} else {
				debug_printf (DEBUG_QUIET, "... ERROR: no cycle found; "
							"unknown error condition\n");
				dagStatus = Dag::DAG_STATUS_ERROR;
			}
			if ( debug_level >= DEBUG_NORMAL ) {
				dagman.dag->PrintJobList();
			}
		}

		main_shutdown_rescue( EXIT_ERROR, dagStatus );
		return;
    }
}
Пример #18
0
	// 
	// In Config() we get DAGMan-related configuration values.  This
	// is a three-step process:
	// 1. Get the name of the DAGMan-specific config file (if any).
	// 2. If there is a DAGMan-specific config file, process it so
	//    that its values are added to the configuration.
	// 3. Get the values we want from the configuration.
	//
bool
Dagman::Config()
{
	int debug_cache_size = (1024*1024)*5; // 5 MB
	bool debug_cache_enabled = false;

		// Note: debug_printfs are DEBUG_NORMAL here because when we
		// get here we haven't processed command-line arguments yet.

		// Get and process the DAGMan-specific config file (if any)
		// before getting any of the other parameters.
	_dagmanConfigFile = param( "DAGMAN_CONFIG_FILE" );
	if ( _dagmanConfigFile ) {
		debug_printf( DEBUG_NORMAL, "Using DAGMan config file: %s\n",
					_dagmanConfigFile );
			// We do this test here because the corresponding error
			// message from the config code doesn't show up in dagman.out.
		if ( access( _dagmanConfigFile, R_OK ) != 0 &&
					!is_piped_command( _dagmanConfigFile ) ) {
			debug_printf( DEBUG_QUIET,
						"ERROR: Can't read DAGMan config file: %s\n",
						_dagmanConfigFile );
    		DC_Exit( EXIT_ERROR );
		}
		process_config_source( _dagmanConfigFile, "DAGMan config",
					NULL, true );
	}

	_strict = (strict_level_t)param_integer( "DAGMAN_USE_STRICT",
				_strict, DAG_STRICT_0, DAG_STRICT_3 );
	debug_printf( DEBUG_NORMAL, "DAGMAN_USE_STRICT setting: %d\n",
				_strict );

	debug_level = (debug_level_t)param_integer( "DAGMAN_VERBOSITY",
				debug_level, DEBUG_SILENT, DEBUG_DEBUG_4 );
	debug_printf( DEBUG_NORMAL, "DAGMAN_VERBOSITY setting: %d\n",
				debug_level );

	debug_cache_size = 
		param_integer( "DAGMAN_DEBUG_CACHE_SIZE", debug_cache_size,
		0, INT_MAX);
	debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_SIZE setting: %d\n",
				debug_cache_size );

	debug_cache_enabled = 
		param_boolean( "DAGMAN_DEBUG_CACHE_ENABLE", debug_cache_enabled );
	debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_ENABLE setting: %s\n",
				debug_cache_enabled?"True":"False" );

	submit_delay = param_integer( "DAGMAN_SUBMIT_DELAY", submit_delay, 0);
	debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DELAY setting: %d\n",
				submit_delay );

	max_submit_attempts =
		param_integer( "DAGMAN_MAX_SUBMIT_ATTEMPTS", max_submit_attempts,
		1, 16 );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_SUBMIT_ATTEMPTS setting: %d\n",
				max_submit_attempts );

	startup_cycle_detect =
		param_boolean( "DAGMAN_STARTUP_CYCLE_DETECT", startup_cycle_detect );
	debug_printf( DEBUG_NORMAL, "DAGMAN_STARTUP_CYCLE_DETECT setting: %s\n",
				startup_cycle_detect ? "True" : "False" );

	max_submits_per_interval =
		param_integer( "DAGMAN_MAX_SUBMITS_PER_INTERVAL",
		max_submits_per_interval, 1, 1000 );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: %d\n",
				max_submits_per_interval );

	m_user_log_scan_interval =
		param_integer( "DAGMAN_USER_LOG_SCAN_INTERVAL",
		m_user_log_scan_interval, 1, INT_MAX);
	debug_printf( DEBUG_NORMAL, "DAGMAN_USER_LOG_SCAN_INTERVAL setting: %d\n",
				m_user_log_scan_interval );
	_defaultPriority = param_integer("DAGMAN_DEFAULT_PRIORITY", 0, INT_MIN,
		INT_MAX, false);
	_submitDagDeepOpts.always_use_node_log = param_boolean( "DAGMAN_ALWAYS_USE_NODE_LOG", true);


		// Event checking setup...

		// We want to default to allowing the terminated/aborted
		// combination (that's what we've defaulted to in the past).
		// Okay, we also want to allow execute before submit because
		// we've run into that, and since DAGMan doesn't really care
		// about the execute events, it shouldn't abort the DAG.
		// And we further want to allow two terminated events for a
		// single job because people are seeing that with Globus
		// jobs!!
	allow_events = CheckEvents::ALLOW_TERM_ABORT |
			CheckEvents::ALLOW_EXEC_BEFORE_SUBMIT |
			CheckEvents::ALLOW_DOUBLE_TERMINATE |
			CheckEvents::ALLOW_DUPLICATE_EVENTS;

		// If the old DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION param is set,
		// we also allow extra runs.
		// Note: this parameter is probably only used by CDF, and only
		// really needed until they update all their systems to 6.7.3
		// or later (not 6.7.3 pre-release), which fixes the "double-run"
		// bug.
	bool allowExtraRuns = param_boolean(
			"DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION", false );

	if ( allowExtraRuns ) {
		allow_events |= CheckEvents::ALLOW_RUN_AFTER_TERM;
		debug_printf( DEBUG_NORMAL, "Warning: "
				"DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION "
				"is deprecated -- used DAGMAN_ALLOW_EVENTS instead\n" );
		check_warning_strictness( DAG_STRICT_1 );
	}

		// Now get the new DAGMAN_ALLOW_EVENTS value -- that can override
		// all of the previous stuff.
	allow_events = param_integer("DAGMAN_ALLOW_EVENTS", allow_events);
	debug_printf( DEBUG_NORMAL, "allow_events ("
				"DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS"
				") setting: %d\n", allow_events );

		// ...end of event checking setup.

	retrySubmitFirst = param_boolean( "DAGMAN_RETRY_SUBMIT_FIRST",
				retrySubmitFirst );
	debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_SUBMIT_FIRST setting: %s\n",
				retrySubmitFirst ? "True" : "False" );

	retryNodeFirst = param_boolean( "DAGMAN_RETRY_NODE_FIRST",
				retryNodeFirst );
	debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_NODE_FIRST setting: %s\n",
				retryNodeFirst ? "True" : "False" );

	maxIdle =
		param_integer( "DAGMAN_MAX_JOBS_IDLE", maxIdle, 0, INT_MAX );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_IDLE setting: %d\n",
				maxIdle );

	maxJobs =
		param_integer( "DAGMAN_MAX_JOBS_SUBMITTED", maxJobs, 0, INT_MAX );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_SUBMITTED setting: %d\n",
				maxJobs );

	maxPreScripts = param_integer( "DAGMAN_MAX_PRE_SCRIPTS", maxPreScripts,
				0, INT_MAX );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_PRE_SCRIPTS setting: %d\n",
				maxPreScripts );

	maxPostScripts = param_integer( "DAGMAN_MAX_POST_SCRIPTS", maxPostScripts,
				0, INT_MAX );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_POST_SCRIPTS setting: %d\n",
				maxPostScripts );

	allowLogError = param_boolean( "DAGMAN_ALLOW_LOG_ERROR", allowLogError );
	debug_printf( DEBUG_NORMAL, "DAGMAN_ALLOW_LOG_ERROR setting: %s\n",
				allowLogError ? "True" : "False" );

	mungeNodeNames = param_boolean( "DAGMAN_MUNGE_NODE_NAMES",
				mungeNodeNames );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MUNGE_NODE_NAMES setting: %s\n",
				mungeNodeNames ? "True" : "False" );

	prohibitMultiJobs = param_boolean( "DAGMAN_PROHIBIT_MULTI_JOBS",
				prohibitMultiJobs );
	debug_printf( DEBUG_NORMAL, "DAGMAN_PROHIBIT_MULTI_JOBS setting: %s\n",
				prohibitMultiJobs ? "True" : "False" );

	submitDepthFirst = param_boolean( "DAGMAN_SUBMIT_DEPTH_FIRST",
				submitDepthFirst );
	debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DEPTH_FIRST setting: %s\n",
				submitDepthFirst ? "True" : "False" );

	_runPost = param_boolean( "DAGMAN_ALWAYS_RUN_POST", true );
	debug_printf( DEBUG_NORMAL, "DAGMAN_ALWAYS_RUN_POST setting: %s\n",
			_runPost ? "True" : "False" );

	free( condorSubmitExe );
	condorSubmitExe = param( "DAGMAN_CONDOR_SUBMIT_EXE" );
	if( !condorSubmitExe ) {
		condorSubmitExe = strdup( "condor_submit" );
		ASSERT( condorSubmitExe );
	}

	free( condorRmExe );
	condorRmExe = param( "DAGMAN_CONDOR_RM_EXE" );
	if( !condorRmExe ) {
		condorRmExe = strdup( "condor_rm" );
		ASSERT( condorRmExe );
	}

	free( storkSubmitExe );
	storkSubmitExe = param( "DAGMAN_STORK_SUBMIT_EXE" );
	if( !storkSubmitExe ) {
		storkSubmitExe = strdup( "stork_submit" );
		ASSERT( storkSubmitExe );
	}

	free( storkRmExe );
	storkRmExe = param( "DAGMAN_STORK_RM_EXE" );
	if( !storkRmExe ) {
		storkRmExe = strdup( "stork_rm" );
		ASSERT( storkRmExe );
	}

	abortDuplicates = param_boolean( "DAGMAN_ABORT_DUPLICATES",
				abortDuplicates );
	debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_DUPLICATES setting: %s\n",
				abortDuplicates ? "True" : "False" );

	abortOnScarySubmit = param_boolean( "DAGMAN_ABORT_ON_SCARY_SUBMIT",
				abortOnScarySubmit );
	debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_ON_SCARY_SUBMIT setting: %s\n",
				abortOnScarySubmit ? "True" : "False" );

	pendingReportInterval = param_integer( "DAGMAN_PENDING_REPORT_INTERVAL",
				pendingReportInterval );
	debug_printf( DEBUG_NORMAL, "DAGMAN_PENDING_REPORT_INTERVAL setting: %d\n",
				pendingReportInterval );

	if ( param_boolean( "DAGMAN_OLD_RESCUE", false ) ) {
		debug_printf( DEBUG_NORMAL, "Warning: DAGMAN_OLD_RESCUE is "
					"no longer supported\n" );
		check_warning_strictness( DAG_STRICT_1 );
	}

	autoRescue = param_boolean( "DAGMAN_AUTO_RESCUE", autoRescue );
	debug_printf( DEBUG_NORMAL, "DAGMAN_AUTO_RESCUE setting: %s\n",
				autoRescue ? "True" : "False" );
	
	maxRescueDagNum = param_integer( "DAGMAN_MAX_RESCUE_NUM",
				maxRescueDagNum, 0, ABS_MAX_RESCUE_DAG_NUM );
	debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_RESCUE_NUM setting: %d\n",
				maxRescueDagNum );

	_writePartialRescueDag = param_boolean( "DAGMAN_WRITE_PARTIAL_RESCUE",
				_writePartialRescueDag );
	debug_printf( DEBUG_NORMAL, "DAGMAN_WRITE_PARTIAL_RESCUE setting: %s\n",
				_writePartialRescueDag ? "True" : "False" );

	free( _defaultNodeLog );
	_defaultNodeLog = param( "DAGMAN_DEFAULT_NODE_LOG" );
	debug_printf( DEBUG_NORMAL, "DAGMAN_DEFAULT_NODE_LOG setting: %s\n",
				_defaultNodeLog ? _defaultNodeLog : "null" );

	_generateSubdagSubmits = 
		param_boolean( "DAGMAN_GENERATE_SUBDAG_SUBMITS",
		_generateSubdagSubmits );
	debug_printf( DEBUG_NORMAL, "DAGMAN_GENERATE_SUBDAG_SUBMITS setting: %s\n",
				_generateSubdagSubmits ? "True" : "False" );

	_maxJobHolds = param_integer( "DAGMAN_MAX_JOB_HOLDS", _maxJobHolds,
				0, 1000000 );
	_claim_hold_time = param_integer( "DAGMAN_HOLD_CLAIM_TIME", _claim_hold_time, 0, 3600);

	char *debugSetting = param( "ALL_DEBUG" );
	debug_printf( DEBUG_NORMAL, "ALL_DEBUG setting: %s\n",
				debugSetting ? debugSetting : "" );
	if ( debugSetting ) {
		free( debugSetting );
	}

	debugSetting = param( "DAGMAN_DEBUG" );
	debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG setting: %s\n",
				debugSetting ? debugSetting : "" );
	if ( debugSetting ) {
		free( debugSetting );
	}

	// enable up the debug cache if needed
	if (debug_cache_enabled) {
		debug_cache_set_size(debug_cache_size);
		debug_cache_enable();
	}

	return true;
}
Пример #19
0
//---------------------------------------------------------------------------
void main_init (int argc, char ** const argv) {

	printf ("Executing condor dagman ... \n");

		// flag used if DAGMan is invoked with -WaitForDebug so we
		// wait for a developer to attach with a debugger...
	volatile int wait_for_debug = 0;

		// process any config vars -- this happens before we process
		// argv[], since arguments should override config settings
	dagman.Config();

	// The DCpermission (last parm) should probably be PARENT, if it existed
    daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
                                 (SignalHandler) main_shutdown_remove,
                                 "main_shutdown_remove", NULL);

/****** FOR TESTING *******
    daemonCore->Register_Signal( SIGUSR2, "SIGUSR2",
                                 (SignalHandler) main_testing_stub,
                                 "main_testing_stub", NULL);
****** FOR TESTING ********/
    debug_progname = condor_basename(argv[0]);

		// condor_submit_dag version from .condor.sub
	bool allowVerMismatch = false;
	const char *csdVersion = "undefined";

	int i;
    for (i = 0 ; i < argc ; i++) {
        debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] );
    }

    if (argc < 2) Usage();  //  Make sure an input file was specified

		// get dagman job id from environment, if it's there
		// (otherwise it will be set to "-1.-1.-1")
	dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) );

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		// Minimum legal version for a .condor.sub file to be compatible
		// with this condor_dagman binary.

		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
		// Be sure to change this if the arguments or environment
		// passed to condor_dagman change in an incompatible way!!
		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	struct DagVersionData {
		int majorVer;
		int minorVer;
		int subMinorVer;
	};
	const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 };

		// Construct a string of the minimum submit file version.
	MyString minSubmitVersionStr;
	minSubmitVersionStr.formatstr( "%d.%d.%d",
				MIN_SUBMIT_FILE_VERSION.majorVer,
				MIN_SUBMIT_FILE_VERSION.minorVer,
				MIN_SUBMIT_FILE_VERSION.subMinorVer );
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    //
    // Process command-line arguments
    //
    for (i = 1; i < argc; i++) {
        if( !strcasecmp( "-Debug", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No debug level specified\n" );
                Usage();
            }
            debug_level = (debug_level_t) atoi (argv[i]);
        } else if( !strcasecmp( "-Lockfile", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" );
                Usage();
            }
            lockFileName = argv[i];
        } else if( !strcasecmp( "-Help", argv[i] ) ) {
            Usage();
        } else if (!strcasecmp( "-Dag", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DAG specified\n" );
                Usage();
            }
			dagman.dagFiles.append( argv[i] );
        } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxIdle\n" );
                Usage();
            }
            dagman.maxIdle = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxJobs\n" );
                Usage();
            }
            dagman.maxJobs = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) {
			debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with "
						   "-MaxPre and -MaxPost arguments\n" );
			Usage();
        } else if( !strcasecmp( "-MaxPre", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPre\n" );
                Usage();
            }
            dagman.maxPreScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxPost", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPost\n" );
                Usage();
            }
            dagman.maxPostScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) {
			debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is "
						"ignored; please use the DAGMAN_ALLOW_EVENTS "
						"config parameter instead\n");
			check_warning_strictness( DAG_STRICT_1 );

        } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) {
			dagman.allowLogError = true;

        } else if( !strcasecmp( "-DontAlwaysRunPost",argv[i] ) ) {
			dagman._runPost = false;

        } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) {
			wait_for_debug = 1;

        } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) {
			dagman.useDagDir = true;

        } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" );
                Usage();
            }
            dagman.autoRescue = (atoi( argv[i] ) != 0);

        } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" );
                Usage();
            }
            dagman.doRescueFrom = atoi (argv[i]);

        } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" );
                Usage();
            }
			csdVersion = argv[i];

        } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) {
			allowVerMismatch = true;

        } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) {
			dagman.dumpRescueDag = true;

        } else if( !strcasecmp( "-verbose", argv[i] ) ) {
			dagman._submitDagDeepOpts.bVerbose = true;

        } else if( !strcasecmp( "-force", argv[i] ) ) {
			dagman._submitDagDeepOpts.bForce = true;
		
        } else if( !strcasecmp( "-notification", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No notification value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strNotification = argv[i];

        } else if( !strcasecmp( "-dagman", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No dagman value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strDagmanPath = argv[i];

        } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strOutfileDir = argv[i];

        } else if( !strcasecmp( "-update_submit", argv[i] ) ) {
			dagman._submitDagDeepOpts.updateSubmit = true;

        } else if( !strcasecmp( "-import_env", argv[i] ) ) {
			dagman._submitDagDeepOpts.importEnv = true;

        } else if( !strcasecmp( "-priority", argv[i] ) ) {
		++i;
		if( i >= argc || strcmp( argv[i], "" ) == 0 ) {
			debug_printf( DEBUG_NORMAL, "No priority value specified\n");
			Usage();
		}
		dagman._submitDagDeepOpts.priority = atoi(argv[i]);
		} else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) {
			dagman._submitDagDeepOpts.always_use_node_log = false;
        } else {
    		debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n",
						argv[i] );
			Usage();
		}
    }

	dagman.dagFiles.rewind();
	dagman.primaryDagFile = dagman.dagFiles.next();
	dagman.multiDags = (dagman.dagFiles.number() > 1);

	MyString tmpDefaultLog;
	if ( dagman._defaultNodeLog != NULL ) {
		tmpDefaultLog = dagman._defaultNodeLog;
		free( dagman._defaultNodeLog );
	} else {
		tmpDefaultLog = dagman.primaryDagFile + ".nodes.log";
	}

		// Force default log file path to be absolute so it works
		// with -usedagdir and DIR nodes.
	CondorError errstack;
	if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) {
       	debug_printf( DEBUG_QUIET, "Unable to convert default log "
					"file name to absolute path: %s\n",
					errstack.getFullText().c_str() );
		dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR );
		DC_Exit( EXIT_ERROR );
	}
	dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() );
	debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n",
				dagman._defaultNodeLog);

    //
    // Check the arguments
    //

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// Checking for version compatibility between the .condor.sub
	// file and this condor_dagman binary...

	// Note: if we're in recovery mode and the submit file version
	// causes us to quit, we leave any existing node jobs still
	// running -- may want to change that eventually.  wenger 2009-10-13.

		// Version of the condor_submit_dag that created our submit file.
	CondorVersionInfo submitFileVersion( csdVersion );

		// Version of this condor_dagman binary.
	CondorVersionInfo dagmanVersion;

		// Just generate this message fragment in one place.
	MyString versionMsg;
	versionMsg.formatstr("the version (%s) of this DAG's Condor submit "
				"file (created by condor_submit_dag)", csdVersion );

		// Make sure version in submit file is valid.
	if( !submitFileVersion.is_valid() ) {
		if ( !allowVerMismatch ) {
        	debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n",
						versionMsg.Value() );
			DC_Exit( EXIT_ERROR );
		} else {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; "
						"continuing because of -AllowVersionMismatch flag\n",
						versionMsg.Value() );
		}

		// Make sure .condor.sub file is recent enough.
	} else if ( submitFileVersion.compare_versions(
				CondorVersion() ) != 0 ) {

		if( !submitFileVersion.built_since_version(
					MIN_SUBMIT_FILE_VERSION.majorVer,
					MIN_SUBMIT_FILE_VERSION.minorVer,
					MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) {
			if ( !allowVerMismatch ) {
        		debug_printf( DEBUG_QUIET, "Error: %s is older than "
							"oldest permissible version (%s)\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
				DC_Exit( EXIT_ERROR );
			} else {
        		debug_printf( DEBUG_NORMAL, "Warning: %s is older than "
							"oldest permissible version (%s); continuing "
							"because of -AllowVersionMismatch flag\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
			}

			// Warn if .condor.sub file is a newer version than this binary.
		} else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is newer than "
						"condor_dagman version (%s)\n", versionMsg.Value(),
						CondorVersion() );
			check_warning_strictness( DAG_STRICT_3 );
		} else {
        	debug_printf( DEBUG_NORMAL, "Note: %s differs from "
						"condor_dagman version (%s), but the "
						"difference is permissible\n", 
						versionMsg.Value(), CondorVersion() );
		}
	}
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    if( dagman.primaryDagFile == "" ) {
        debug_printf( DEBUG_SILENT, "No DAG file was specified\n" );
        Usage();
    }
    if (lockFileName == NULL) {
        debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" );
        Usage();
    }
    if( dagman.maxJobs < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n");
        Usage();
    }
    if( dagman.maxPreScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" );
        Usage();
    }
    if( dagman.maxPostScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" );
        Usage();
    }
    if( dagman.doRescueFrom < 0 ) {
        debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" );
        Usage();
    }

    debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n",
                   lockFileName );
	if ( dagman.dagFiles.number() == 1 ) {
    	debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n",
				  	dagman.primaryDagFile.Value() );
	} else {
		MyString msg = "DAG Input files are ";
		dagman.dagFiles.rewind();
		const char *dagFile;
		while ( (dagFile = dagman.dagFiles.next()) != NULL ) {
			msg += dagFile;
			msg += " ";
		}
		msg += "\n";
    	debug_printf( DEBUG_VERBOSE, "%s", msg.Value() );
	}

		// if requested, wait for someone to attach with a debugger...
	while( wait_for_debug ) { }

    {
		MyString cwd;
		if( !condor_getcwd(cwd) ) {
			cwd = "<null>";
		}
        debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value());

		char *temp = my_username();
		debug_printf( DEBUG_DEBUG_1, "Current user is %s\n",
					   temp ? temp : "<null>" );
		if( temp ) {
			free( temp );
		}
    }

		//
		// Figure out the rescue DAG to run, if any (this is with "new-
		// style" rescue DAGs).
		//
	int rescueDagNum = 0;
	MyString rescueDagMsg;

	if ( dagman.doRescueFrom != 0 ) {
		rescueDagNum = dagman.doRescueFrom;
		rescueDagMsg.formatstr( "Rescue DAG number %d specified", rescueDagNum );
		RenameRescueDagsAfter( dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum );

	} else if ( dagman.autoRescue ) {
		rescueDagNum = FindLastRescueDagNum(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum );
		rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum );
	}

		//
		// Fill in values in the deep submit options that we haven't
		// already set.
		//
	dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError;
	dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir;
	dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue;
	dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom;
	dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch;
	dagman._submitDagDeepOpts.recurse = false;

    //
    // Create the DAG
    //

	// Note: a bunch of the parameters we pass here duplicate things
	// in submitDagOpts, but I'm keeping them separate so we don't have to
	// bother to construct a new SubmitDagOtions object for splices.
	// wenger 2010-03-25
    dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs,
						  dagman.maxPreScripts, dagman.maxPostScripts,
						  dagman.allowLogError, dagman.useDagDir,
						  dagman.maxIdle, dagman.retrySubmitFirst,
						  dagman.retryNodeFirst, dagman.condorRmExe,
						  dagman.storkRmExe, &dagman.DAGManJobId,
						  dagman.prohibitMultiJobs, dagman.submitDepthFirst,
						  dagman._defaultNodeLog,
						  dagman._generateSubdagSubmits,
						  &dagman._submitDagDeepOpts,
						  false ); /* toplevel dag! */

    if( dagman.dag == NULL ) {
        EXCEPT( "ERROR: out of memory!\n");
    }

	dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit );
	dagman.dag->SetAllowEvents( dagman.allow_events );
	dagman.dag->SetConfigFile( dagman._dagmanConfigFile );
	dagman.dag->SetMaxJobHolds( dagman._maxJobHolds );
	dagman.dag->SetPostRun(dagman._runPost);
	if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line
		dagman.dag->SetDefaultPriority(dagman._submitDagDeepOpts.priority);
	} else if( dagman._defaultPriority != 0 ) { // From config file
		dagman.dag->SetDefaultPriority(dagman._defaultPriority);
		dagman._submitDagDeepOpts.priority = dagman._defaultPriority;
	}

    //
    // Parse the input files.  The parse() routine
    // takes care of adding jobs and dependencies to the DagMan
    //
	dagman.mungeNodeNames = (dagman.dagFiles.number() > 1);
	parseSetDoNameMunge( dagman.mungeNodeNames );
   	debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n", 
		dagman.dagFiles.number() );
	dagman.dagFiles.rewind();
	char *dagFile;

	// Here we make a copy of the dagFiles for iteration purposes. Deep inside
	// of the parsing, copies of the dagman.dagFile string list happen which
	// mess up the iteration of this list.
	StringList sl( dagman.dagFiles );
	sl.rewind();
	while ( (dagFile = sl.next()) != NULL ) {
    	debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile );

    	if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							false, true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored.
			unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagFile );
    	}
	}
	if( dagman.dag->GetDefaultPriority() != 0 ) {
		dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag
	}
	dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId );
	if ( rescueDagNum > 0 ) {
			// Get our Pegasus sequence numbers set correctly.
		dagman.dag->GetJobstateLog().InitializeRescue();
	}

	// lift the final set of splices into the main dag.
	dagman.dag->LiftSplices(SELF);

		//
		// Actually parse the "new-new" style (partial DAG info only)
		// rescue DAG here.  Note: this *must* be done after splices
		// are lifted!
		//
	if ( rescueDagNum > 0 ) {
		dagman.rescueFileToRun = RescueDagName(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum );
		debug_printf ( DEBUG_QUIET, "%s; running %s in combination with "
					"normal DAG file%s\n", rescueDagMsg.Value(),
					dagman.rescueFileToRun.Value(),
					dagman.multiDags ? "s" : "");
		debug_printf ( DEBUG_QUIET,
					"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

		debug_printf ( DEBUG_QUIET, "USING RESCUE DAG %s\n",
					dagman.rescueFileToRun.Value() );

			// Turn off node name munging for the rescue DAG, because
			// it will already have munged node names.
		parseSetDoNameMunge( false );

    	if( !parse( dagman.dag, dagman.rescueFileToRun.Value(),
					dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored.
			unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagFile );
    	}
	}

	dagman.dag->CheckThrottleCats();

	// fix up any use of $(JOB) in the vars values for any node
	dagman.dag->ResolveVarsInterpolations();

/*	debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/
/*	dagman.dag->PrintJobList();*/

#ifndef NOT_DETECT_CYCLE
	if( dagman.startup_cycle_detect && dagman.dag->isCycle() )
	{
		// Note: maybe we should run the final node here, if there is one.
		// wenger 2011-12-19.
		debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, please check input\n");
	}
#endif
    debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n",
				  dagman.dag->NumNodes( true ) );

	MyString firstLocation;
	if ( dagman.dag->GetReject( firstLocation ) ) {
    	debug_printf( DEBUG_QUIET, "Exiting because of REJECT "
					"specification in %s.  This most likely means "
					"that the DAG file was produced with the -DumpRescue "
					"flag when parsing the original DAG failed.\n",
					firstLocation.Value() );
		DC_Exit( EXIT_ERROR );
		return;
	}

	dagman.dag->DumpDotFile();

	if ( dagman.dumpRescueDag ) {
    	debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting "
					"because of -DumpRescue flag\n" );
		dagman.dag->Rescue( dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum, false,
					false, false );
		ExitSuccess();
		return;
	}

    //------------------------------------------------------------------------
    // Bootstrap and Recovery
    //
    // If the Lockfile exists, this indicates a premature termination
    // of a previous run of Dagman. If condor log is also present,
    // we run in recovery mode
  
    // If the Daglog is not present, then we do not run in recovery
    // mode
  
    {
      bool recovery = access(lockFileName,  F_OK) == 0;
      
        if (recovery) {
            debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n",
                           lockFileName);
			if (dagman.abortDuplicates) {
				if (util_check_lock_file(lockFileName) == 1) {
        			debug_printf( DEBUG_QUIET, "Aborting because it "
							"looks like another instance of DAGMan is "
							"currently running on this DAG; if that is "
							"not the case, delete the lock file (%s) "
							"and re-submit the DAG.\n", lockFileName );
					dagman.dag->GetJobstateLog().
								WriteDagmanFinished( EXIT_RESTART );
    				dagman.CleanUp();
					DC_Exit( EXIT_ERROR );
					// We should never get to here!
				}
			}
        }

			//
			// If this DAGMan continues, it should overwrite the lock
			// file if it exists.
			//
		util_create_lock_file(lockFileName, dagman.abortDuplicates);

        debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n");
        if( !dagman.dag->Bootstrap( recovery ) ) {
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n");
        }
    }

    debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" );
    daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval, 
				condor_event_timer, "condor_event_timer" );

	dagman.dag->SetPendingNodeReportInterval(
				dagman.pendingReportInterval );
}
Пример #20
0
int util_create_lock_file(const char *lockFileName, bool abortDuplicates) {
	int result = 0;

	FILE *fp = safe_fopen_wrapper_follow( lockFileName, "w" );
	if ( fp == NULL ) {
		debug_printf( DEBUG_QUIET,
					"ERROR: could not open lock file %s for writing.\n",
					lockFileName);
		result = -1;
	}

		//
		// Create the ProcessId object.
		//
	ProcessId *procId = NULL;
	if ( result == 0 && abortDuplicates ) {
		int status;
		int precision_range = 1;
		if ( ProcAPI::createProcessId( daemonCore->getpid(), procId,
					status, &precision_range ) != PROCAPI_SUCCESS ) {
			debug_printf( DEBUG_QUIET, "ERROR: ProcAPI::createProcessId() "
						"failed; %d\n", status );
			result = -1;
		}
	}

		//
		// Write out the ProcessId object.
		//
	if ( result == 0 && abortDuplicates ) {
		if ( procId->write( fp ) != ProcessId::SUCCESS ) {
			debug_printf( DEBUG_QUIET, "ERROR: ProcessId::write() failed\n");
			result = -1;
		}
	}

		//
		// Sleep to ensure uniqueness of the ProcessId object.
		//
	if ( result == 0 && abortDuplicates ) {
		const int maxSleepTime = 60; // seconds; arbitrarily chosen
		int sleepTime = procId->computeWaitTime();

		if ( sleepTime > maxSleepTime ) {
			debug_printf( DEBUG_QUIET, "Warning: ProcessId computed sleep "
						"time (%d) exceeds maximum (%d); skipping sleep/"
						"confirm step\n", sleepTime, maxSleepTime );
			check_warning_strictness( DAG_STRICT_3 );
		} else {
			debug_printf( DEBUG_NORMAL, "Sleeping for %d seconds to "
						"ensure ProcessId uniqueness\n", sleepTime );

#if defined(WIN32)
			sleep( sleepTime );
#else
			while( (sleepTime = sleep( sleepTime ) ) != 0 ) { }
#endif

				//
				// Confirm the ProcessId object's uniqueness.
				//
			int status;
			if ( ProcAPI::confirmProcessId( *procId, status ) !=
						PROCAPI_SUCCESS ) {
				debug_printf( DEBUG_QUIET, "Warning: ProcAPI::"
							"confirmProcessId() failed; %d\n", status );
				check_warning_strictness( DAG_STRICT_3 );
			} else {
				if ( !procId->isConfirmed() ) {
					debug_printf( DEBUG_QUIET, "Warning: ProcessId not "
								"confirmed unique\n" );
					check_warning_strictness( DAG_STRICT_3 );
				} else {

						//
						// Write out the confirmation.
						//
					if ( procId->writeConfirmationOnly( fp ) !=
								ProcessId::SUCCESS ) {
						debug_printf( DEBUG_QUIET, "ERROR: ProcessId::"
									"writeConfirmationOnly() failed\n");
						result = -1;
					}
				}
			}
		}
	}

	delete procId;

	if ( fp != NULL ) {
		if ( fclose( fp ) != 0 ) {
			debug_printf( DEBUG_QUIET, "ERROR: closing lock "
						"file failed with errno %d (%s)\n", errno,
						strerror( errno ) );
		}
	}

	return result;
}
Пример #21
0
//---------------------------------------------------------------------------
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
			bool recovery, const char *defaultNodeLog, bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already monitored\n", GetJobName() );
		return true;
	}

	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

    std::string logFileStr;
	if ( _jobType == TYPE_CONDOR ) {
			// We check to see if the user has specified a log file
			// If not, we give him a default
    	MyString templogFileStr = MultiLogFiles::loadLogFileNameFromSubFile( _cmdFile,
					_directory, _logFileIsXml, usingDefault);
		logFileStr = templogFileStr.Value();
	} else {
		StringList logFiles;
		MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, logFiles );
		if ( tmpResult != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						tmpResult.Value() );
			LogMonitorFailed();
			return false;
		} else if ( logFiles.number() != 1 ) {
			debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
						"in submit file %s; we want 1\n",
						logFiles.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		} else {
			logFiles.rewind();
			logFileStr = logFiles.next();
		}
	}

		// Warn the user if the node's log file is in /tmp.
	if ( logFileStr.find( "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET, "Warning: "
					"Log file %s for node %s is in /tmp\n",
					logFileStr.c_str(), GetJobName() );
        check_warning_strictness( usingDefault ? DAG_STRICT_2 : DAG_STRICT_1 );
	}

	if ( logFileStr == "" ) {
		logFileStr = defaultNodeLog;
		_useDefaultLog = true;
			// Default User log is never XML
			// This could be specified in the submit file and should be
			// ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL, "Unable to get log file from "
					"submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logFileStr.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if( append_default_log ) {
				// DAGman is not going to look at the user-specified log.
				// It will look at the defaultNode log.
			logFileStr = defaultNodeLog;
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

		// This function returns true if the log file is on NFS and
		// that is an error.  If the log file is on NFS, but nfsIsError
		// is false, it prints a warning but returns false.
	if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
				nfsIsError ) ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logFileStr.c_str() );
		LogMonitorFailed();
		return false;
	}

	delete [] _logFile;
		// Saving log file here in case submit file gets changed.
	_logFile = strnewp( logFileStr.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );
	CondorError errstack;
	if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}