//---------------------------------------------------------------------------
// This does only partial parsing -- only what we need for recovery mode
// and rescue initialization.
bool
JobstateLog::ParseLine( MyString &line, time_t &timestamp,
            MyString &nodeName, int &seqNum )
{
    line.chomp();
    line.Tokenize();
    const char* timestampTok = line.GetNextToken( " ", false );
    const char* nodeNameTok = line.GetNextToken( " ", false );
    (void)line.GetNextToken( " ", false ); // event name
    (void)line.GetNextToken( " ", false ); // condor id
    (void)line.GetNextToken( " ", false ); // job tag (pegasus site)
    (void)line.GetNextToken( " ", false ); // unused
    const char* seqNumTok = line.GetNextToken( " ", false );

    if ( (timestampTok == NULL) || (nodeNameTok == NULL) ) {
        debug_printf( DEBUG_QUIET, "Warning: error parsing "
                    "jobstate.log file line <%s>\n", line.Value() );
        check_warning_strictness( DAG_STRICT_1 );
        return false;
    }

        // fetch the number, and get a pointer to the first char after;
        // if the pointer did not advance, then there was no number to parse.
    char *pend;
    timestamp = (time_t)strtoll( timestampTok, &pend, 10 );
    if ( pend == timestampTok ) {
        debug_printf( DEBUG_QUIET, "Warning: error reading "
                    "timestamp in jobstate.log file line <%s>\n",
                    line.Value() );
        check_warning_strictness( DAG_STRICT_1 );
        return false;
    }

    nodeName = nodeNameTok;

    seqNum = 0;
    if ( seqNumTok ) {
        seqNum = (int)strtol( seqNumTok, &pend, 10 );
        if ( pend == seqNumTok ) {
            debug_printf( DEBUG_QUIET, "Warning: error reading "
                        "sequence number in jobstate.log file line <%s>\n",
                        line.Value() );
            check_warning_strictness( DAG_STRICT_1 );
            return false;
        }
    }

    return true;
}
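// Format sketch (hedged; field layout inferred from the partial parsing
// above, sample values hypothetical): a jobstate.log line looks roughly like
//
//     <timestamp> <node name> <event name> <condor id> <job tag> - <seq num>
//     e.g.  1423087083 NodeA SUBMIT 123.0 local - 5
//
// ParseLine() consumes only the timestamp, node name, and sequence number;
// the event name, condor id, job tag, and the unused field are skipped.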
void
Job::SetCategory( const char *categoryName, ThrottleByCategory &catThrottles )
{
    MyString tmpName( categoryName );

    if ( (_throttleInfo != NULL) &&
                (tmpName != *(_throttleInfo->_category)) ) {
        debug_printf( DEBUG_NORMAL, "Warning: new category %s for node %s "
                    "overrides old value %s\n", categoryName, GetJobName(),
                    _throttleInfo->_category->Value() );
        check_warning_strictness( DAG_STRICT_3 );
    }

        // Note: we must assign a ThrottleInfo here even if the name
        // already matches, for the case of lifting splices.
    ThrottleByCategory::ThrottleInfo *oldInfo = _throttleInfo;

    ThrottleByCategory::ThrottleInfo *throttleInfo =
                catThrottles.GetThrottleInfo( &tmpName );
    if ( throttleInfo != NULL ) {
        _throttleInfo = throttleInfo;
    } else {
        _throttleInfo = catThrottles.AddCategory( &tmpName );
    }

    if ( oldInfo != _throttleInfo ) {
        if ( oldInfo != NULL ) {
            oldInfo->_totalJobs--;
        }
        _throttleInfo->_totalJobs++;
    }
}
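// Context sketch (hedged): SetCategory() is the node-side half of DAG-file
// category throttling.  A DAG file assigns a node to a category and caps
// the category with the standard keywords, e.g. (names hypothetical):
//
//     CATEGORY NodeA transfer
//     MAXJOBS transfer 10
//
// The _totalJobs bookkeeping above keeps the per-category totals consistent
// when splices are lifted and a node is re-assigned to a lifted category.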
//---------------------------------------------------------------------------
void
JobstateLog::WriteEvent( const ULogEvent *event, Job *node )
{
    if ( !_jobstateLogFile ) {
        return;
    }

    ASSERT( node );

    const char *prefix = "ULOG_";
    const char *eventName = ULogEventNumberNames[event->eventNumber];
    if ( strstr( eventName, prefix ) != eventName ) {
        debug_printf( DEBUG_QUIET, "Warning: didn't find expected prefix "
                    "%s in event name %s\n", prefix, eventName );
        check_warning_strictness( DAG_STRICT_1 );
    } else {
        eventName = eventName + strlen( prefix );
    }

    if ( eventName != NULL ) {
        MyString condorID;
        CondorID2Str( event->cluster, event->proc, condorID );
        struct tm eventTm = event->eventTime;
        time_t eventTime = mktime( &eventTm );
        Write( &eventTime, node, eventName, condorID.Value() );
    }
}
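// Illustrative example (hedged): for a submit event, ULogEventNumberNames[]
// yields "ULOG_SUBMIT"; after the "ULOG_" prefix is stripped, the
// jobstate.log entry is written with the event name "SUBMIT", the node,
// and a condor id such as "123.0" (values hypothetical).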
bool
Job::AddParent( Job* parent, MyString &whynot )
{
    if( !this->CanAddParent( parent, whynot ) ) {
        return false;
    }

    if( HasParent( parent ) ) {
        debug_printf( DEBUG_QUIET,
                    "Warning: child %s already has parent %s\n",
                    GetJobName(), parent->GetJobName() );
        check_warning_strictness( DAG_STRICT_3 );
        return true;
    }

    if( !Add( Q_PARENTS, parent->GetJobID() ) ) {
        whynot = "unknown error appending to PARENTS queue";
        return false;
    }

    if( parent->GetStatus() != STATUS_DONE ) {
        if( !Add( Q_WAITING, parent->GetJobID() ) ) {
                // this node's dependency queues are now out of sync and
                // thus the DAG state is FUBAR, so we should bail...
            EXCEPT( "Failed to add parent %s to job %s",
                        parent->GetJobName(), GetJobName() );
            return false;
        }
    }

    whynot = "n/a";
    return true;
}
//---------------------------------------------------------------------------
void
DagmanClassad::GetInfo( MyString &owner, MyString &nodeName )
{
    if ( !_valid ) {
        debug_printf( DEBUG_VERBOSE,
                    "Skipping ClassAd query -- DagmanClassad object is invalid\n" );
        return;
    }

    Qmgr_connection *queue = OpenConnection();
    if ( !queue ) {
        return;
    }

    if ( !GetDagAttribute( ATTR_OWNER, owner ) ) {
        check_warning_strictness( DAG_STRICT_1 );
        owner = "undef";
    }

    if ( !GetDagAttribute( ATTR_DAG_NODE_NAME, nodeName ) ) {
            // We should only get this value if we're a sub-DAG.
        nodeName = "undef";
    }

    CloseConnection( queue );

    return;
}
//---------------------------------------------------------------------------
DagmanClassad::DagmanClassad( const CondorID &DAGManJobId ) :
    _valid( false ),
    _schedd( NULL )
{
    CondorID defaultCondorId;
    if ( DAGManJobId == defaultCondorId ) {
        debug_printf( DEBUG_QUIET, "No HTCondor ID available for DAGMan "
                    "(running on command line?); DAG status will not be "
                    "reported to ClassAd\n" );
        return;
    }

    _dagmanId = DAGManJobId;

    _schedd = new DCSchedd( NULL, NULL );
    if ( !_schedd || !_schedd->locate() ) {
        const char *errMsg = _schedd ? _schedd->error() : "?";
        debug_printf( DEBUG_QUIET,
                    "WARNING: can't find address of local schedd for "
                    "ClassAd updates (%s)\n", errMsg );
        check_warning_strictness( DAG_STRICT_3 );
        return;
    }

    _valid = true;

    InitializeMetrics();
}
//---------------------------------------------------------------------------
void
DagmanClassad::CloseConnection( Qmgr_connection *queue )
{
    if ( !DisconnectQ( queue ) ) {
        debug_printf( DEBUG_QUIET,
                    "WARNING: queue transaction failed.  No attributes were set.\n" );
        check_warning_strictness( DAG_STRICT_3 );
    }
}
//---------------------------------------------------------------------------
void
DagmanClassad::SetDagAttribute( const char *attrName, const ClassAd &ad )
{
    if ( SetAttributeExpr( _dagmanId._cluster, _dagmanId._proc,
                attrName, &ad ) != 0 ) {
        debug_printf( DEBUG_QUIET,
                    "WARNING: failed to set attribute %s\n", attrName );
        check_warning_strictness( DAG_STRICT_3 );
    }
}
bool
AddNode( Dag *dag, const char *name,
         const char* directory,
         const char* submitFile,
         bool noop,
         bool done, bool isFinal,
         MyString &failReason )
{
    MyString why;
    if( !IsValidNodeName( dag, name, why ) ) {
        failReason = why;
        return false;
    }
    if( !IsValidSubmitFileName( submitFile, why ) ) {
        failReason = why;
        return false;
    }
    if( done && isFinal ) {
        failReason.formatstr( "Warning: FINAL Job %s cannot be set to DONE\n",
                    name );
        debug_printf( DEBUG_QUIET, "%s", failReason.Value() );
        (void)check_warning_strictness( DAG_STRICT_1, false );
        done = false;
    }
    Job* node = new Job( name, directory, submitFile );
    if( !node ) {
        dprintf( D_ALWAYS, "ERROR: out of memory!\n" );
            // we already know we're out of memory, so filling in
            // failReason will likely fail, but give it a shot...
        failReason = "out of memory!";
        return false;
    }
    node->SetNoop( noop );
    if( done ) {
        node->SetStatus( Job::STATUS_DONE );
    }
    node->SetFinal( isFinal );
    ASSERT( dag != NULL );
    if( !dag->Add( *node ) ) {
        failReason = "unknown failure adding ";
        failReason += isFinal ? "Final " : "";
        failReason += "node to DAG";
        delete node;
        return false;
    }
    failReason = "n/a";
    return true;
}
//---------------------------------------------------------------------------
Qmgr_connection *
DagmanClassad::OpenConnection()
{
        // Open job queue
    CondorError errstack;
    Qmgr_connection *queue = ConnectQ( _schedd->addr(), 0, false,
                &errstack, NULL, _schedd->version() );
    if ( !queue ) {
        debug_printf( DEBUG_QUIET,
                    "WARNING: failed to connect to queue manager (%s)\n",
                    errstack.getFullText().c_str() );
        check_warning_strictness( DAG_STRICT_3 );
        return NULL;
    }

    return queue;
}
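// Usage sketch (hedged, illustrative only -- the attribute name and ad are
// hypothetical): the Open/Set/Close methods above are meant to be used as
// one queue transaction, roughly:
//
//     Qmgr_connection *queue = OpenConnection();
//     if ( queue ) {
//         SetDagAttribute( "DagStatusAd", statusAd );
//         CloseConnection( queue ); // commits, or warns on failure
//     }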
//---------------------------------------------------------------------------
void
Job::TermAbortMetrics( int proc, const struct tm &eventTime,
            DagmanMetrics *metrics )
{
    if ( proc >= static_cast<int>( _gotEvents.size() ) ) {
        debug_printf( DEBUG_NORMAL,
                    "Warning for node %s: got terminated or aborted event "
                    "for proc %d, but no execute event!\n",
                    GetJobName(), proc );
        check_warning_strictness( DAG_STRICT_2 );

        _gotEvents.resize( proc+1, 0 );
    }

    if ( !( _gotEvents[proc] & ABORT_TERM_MASK ) ) {
        _gotEvents[proc] |= ABORT_TERM_MASK;
        metrics->ProcFinished( eventTime );
    }
}
//---------------------------------------------------------------------------
void
Job::Cleanup()
{
    std::vector<unsigned char> s;
    _onHold.swap( s ); // Free memory in _onHold

    for ( int proc = 0; proc < static_cast<int>( _gotEvents.size() );
                proc++ ) {
        if ( _gotEvents[proc] != ( EXEC_MASK | ABORT_TERM_MASK ) ) {
            debug_printf( DEBUG_NORMAL,
                        "Warning for node %s: unexpected _gotEvents value "
                        "for proc %d: %d!\n", GetJobName(), proc,
                        (int)_gotEvents[proc] );
            check_warning_strictness( DAG_STRICT_2 );
        }
    }

    std::vector<unsigned char> s2;
    _gotEvents.swap( s2 ); // Free memory in _gotEvents
}
bool
Job::AddChild( Job* child, MyString &whynot )
{
    if( !this->CanAddChild( child, whynot ) ) {
        return false;
    }

    if( HasChild( child ) ) {
        debug_printf( DEBUG_NORMAL,
                    "Warning: parent %s already has child %s\n",
                    GetJobName(), child->GetJobName() );
        check_warning_strictness( DAG_STRICT_3 );
        return true;
    }

    if( !Add( Q_CHILDREN, child->GetJobID() ) ) {
        whynot = "unknown error appending to CHILDREN queue";
        return false;
    }

    whynot = "n/a";
    return true;
}
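// Edge sketch (hedged): a DAG-file dependency such as
//
//     PARENT A CHILD B
//
// is registered from both sides -- conceptually A->AddChild( B, whynot )
// and B->AddParent( A, whynot ) -- so the PARENTS, CHILDREN, and WAITING
// queues of the two nodes stay in sync.  (The parse-time call site is not
// shown in this file.)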
//-----------------------------------------------------------------------------
int
util_popen( ArgList &args )
{
    MyString cmd; // for debug output
    args.GetArgsStringForDisplay( &cmd );
    debug_printf( DEBUG_VERBOSE, "Running: %s\n", cmd.Value() );

    FILE *fp = my_popen( args, "r", MY_POPEN_OPT_WANT_STDERR );

    int r = 0;
    if ( fp == NULL || ( r = my_pclose( fp ) & 0xff ) != 0 ) {
        debug_printf( DEBUG_QUIET, "Warning: failure: %s\n", cmd.Value() );
        if( fp != NULL ) {
            debug_printf( DEBUG_QUIET,
                        "\t(my_pclose() returned %d (errno %d, %s))\n",
                        r, errno, strerror( errno ) );
        } else {
            debug_printf( DEBUG_QUIET,
                        "\t(my_popen() returned NULL (errno %d, %s))\n",
                        errno, strerror( errno ) );
            r = -1;
        }
        check_warning_strictness( DAG_STRICT_1 );
    }
    return r;
}
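// Usage sketch (hedged, illustrative only -- the command is hypothetical):
//
//     ArgList args;
//     args.AppendArg( "chmod" );
//     args.AppendArg( "+x" );
//     args.AppendArg( "my_script" );
//     int status = util_popen( args ); // 0 on success; nonzero or -1 on failure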
//-----------------------------------------------------------------------------
int
Script::BackgroundRun( int reaperId, int dagStatus, int failedCount )
{
    TmpDir tmpDir;
    MyString errMsg;
    if ( !tmpDir.Cd2TmpDir( _node->GetDirectory(), errMsg ) ) {
        debug_printf( DEBUG_QUIET,
                    "Could not change to node directory %s: %s\n",
                    _node->GetDirectory(), errMsg.Value() );
        return 0;
    }

        // Construct the command line, replacing some tokens with
        // information about the job.  All of these values would probably
        // be better inserted into the environment, rather than passed on
        // the command-line... some should be in the job's env as well...

    const char *delimiters = " \t";
    char * token;
    ArgList args;
    char * cmd = strnewp( _cmd );
    for ( token = strtok( cmd, delimiters ); token != NULL;
                token = strtok( NULL, delimiters ) ) {
        MyString arg;

        if ( !strcasecmp( token, "$JOB" ) ) {
            arg += _node->GetJobName();
        } else if ( !strcasecmp( token, "$RETRY" ) ) {
            arg += _node->GetRetries();
        } else if ( !strcasecmp( token, "$MAX_RETRIES" ) ) {
            arg += _node->GetRetryMax();
        } else if ( !strcasecmp( token, "$JOBID" ) ) {
            if ( !_post ) {
                debug_printf( DEBUG_QUIET, "Warning: $JOBID macro should "
                            "not be used as a PRE script argument!\n" );
                check_warning_strictness( DAG_STRICT_1 );
                arg += token;
            } else {
                arg += _node->_CondorID._cluster;
                arg += '.';
                arg += _node->_CondorID._proc;
            }
        } else if ( !strcasecmp( token, "$RETURN" ) ) {
            if ( !_post ) {
                debug_printf( DEBUG_QUIET, "Warning: $RETURN macro should "
                            "not be used as a PRE script argument!\n" );
                check_warning_strictness( DAG_STRICT_1 );
            }
            arg += _retValJob;
        } else if ( !strcasecmp( token, "$PRE_SCRIPT_RETURN" ) ) {
            if ( !_post ) {
                debug_printf( DEBUG_QUIET, "Warning: $PRE_SCRIPT_RETURN "
                            "macro should not be used as a PRE script "
                            "argument!\n" );
                check_warning_strictness( DAG_STRICT_1 );
            }
            arg += _retValScript;
        } else if ( !strcasecmp( token, "$DAG_STATUS" ) ) {
            arg += dagStatus;
        } else if ( !strcasecmp( token, "$FAILED_COUNT" ) ) {
            arg += failedCount;
        } else if ( token[0] == '$' ) {
                // This should probably be a fatal error when -strict is
                // implemented.
            debug_printf( DEBUG_QUIET, "Warning: unrecognized macro %s "
                        "in node %s %s script arguments\n", token,
                        _node->GetJobName(), _post ? "POST" : "PRE" );
            check_warning_strictness( DAG_STRICT_1 );
            arg += token;
        } else {
            arg += token;
        }

        args.AppendArg( arg.Value() );
    }

    _pid = daemonCore->Create_Process( cmd, args, PRIV_UNKNOWN, reaperId,
                FALSE, NULL, NULL, NULL, NULL, NULL, 0 );
    delete [] cmd;

    if ( !tmpDir.Cd2MainDir( errMsg ) ) {
        debug_printf( DEBUG_QUIET,
                    "Could not change to original directory: %s\n",
                    errMsg.Value() );
        return 0;
    }

    return _pid;
}
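// Macro-expansion example (hedged; script name and values hypothetical):
// for a DAG file line such as
//
//     SCRIPT POST NodeA check_output.sh $JOB $RETURN
//
// BackgroundRun() builds the argument list "NodeA <job return value>".
// As the warnings above enforce, $JOBID, $RETURN, and $PRE_SCRIPT_RETURN
// are only meaningful for POST scripts.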
//-------------------------------------------------------------------------
bool
condor_submit( const Dagman &dm, const char* cmdFile, CondorID& condorID,
               const char* DAGNodeName, MyString &DAGParentNodeNames,
               List<Job::NodeVar> *vars, int retry,
               const char* directory, const char *workflowLogFile,
               bool hold_claim )
{
    TmpDir tmpDir;
    MyString errMsg;
    if ( !tmpDir.Cd2TmpDir( directory, errMsg ) ) {
        debug_printf( DEBUG_QUIET,
                    "Could not change to node directory %s: %s\n",
                    directory, errMsg.Value() );
        return false;
    }

    ArgList args;

        // construct arguments to condor_submit to add attributes to the
        // job classad which identify the job's node name in the DAG, the
        // node names of its parents in the DAG, and the job ID of DAGMan
        // itself; then, define submit_event_notes to print the job's node
        // name inside the submit event in the userlog

        // NOTE: we specify the job ID of DAGMan using only its cluster ID
        // so that it may be referenced by jobs in their priority
        // attribute (which needs an int, not a string).  Doing so allows
        // users to effectively "batch" jobs by DAG so that when they
        // submit many DAGs to the same schedd, all the ready jobs from
        // one DAG complete before any jobs from another begin.

    args.AppendArg( dm.condorSubmitExe );

    args.AppendArg( "-a" );
    MyString nodeName = MyString( ATTR_DAG_NODE_NAME_ALT ) + " = " +
                DAGNodeName;
    args.AppendArg( nodeName.Value() );

        // append a line adding the parent DAGMan's cluster ID to the job ad
    args.AppendArg( "-a" );
    MyString dagJobId = MyString( "+" ) + ATTR_DAGMAN_JOB_ID + " = " +
                dm.DAGManJobId._cluster;
    args.AppendArg( dagJobId.Value() );

        // now we append a line setting the same thing as a submit-file macro
        // (this is necessary so the user can reference it in the priority)
    args.AppendArg( "-a" );
    MyString dagJobIdMacro = MyString( "" ) + ATTR_DAGMAN_JOB_ID + " = " +
                dm.DAGManJobId._cluster;
    args.AppendArg( dagJobIdMacro.Value() );

    args.AppendArg( "-a" );
    MyString submitEventNotes = MyString(
                "submit_event_notes = DAG Node: " ) + DAGNodeName;
    args.AppendArg( submitEventNotes.Value() );

    ASSERT( workflowLogFile );

        // We need to append the DAGMan default log file to
        // the log file list
    args.AppendArg( "-a" );
    std::string dlog( "dagman_log = " );
    dlog += workflowLogFile;
    args.AppendArg( dlog.c_str() );
    debug_printf( DEBUG_VERBOSE, "Adding a DAGMan workflow log %s\n",
                workflowLogFile );

        // Now append the mask
    debug_printf( DEBUG_VERBOSE,
                "Masking the events recorded in the DAGMAN workflow log\n" );
    args.AppendArg( "-a" );
    std::string dmask( "+" );
    dmask += ATTR_DAGMAN_WORKFLOW_MASK;
    dmask += " = \"";
    const char *eventMask = getEventMask();
    debug_printf( DEBUG_VERBOSE, "Mask for workflow log is %s\n",
                eventMask );
    dmask += eventMask;
    dmask += "\"";
    args.AppendArg( dmask.c_str() );

        // Suppress the job's log file if that option is enabled.
    if ( dm._suppressJobLogs ) {
        debug_printf( DEBUG_VERBOSE, "Suppressing node job log file\n" );
        args.AppendArg( "-a" );
        args.AppendArg( "log = ''" );
    }

    ArgList parentNameArgs;
    parentNameArgs.AppendArg( "-a" );
    MyString parentNodeNames = MyString( "+DAGParentNodeNames = " ) +
                "\"" + DAGParentNodeNames + "\"";
    parentNameArgs.AppendArg( parentNodeNames.Value() );

        // set any VARS specified in the DAG file
    MyString anotherLine;
    ListIterator<Job::NodeVar> varsIter( *vars );
    Job::NodeVar nodeVar;
    while ( varsIter.Next( nodeVar ) ) {
            // Substitute the node retry count if necessary.  Note that
            // we can't do this in Job::ResolveVarsInterpolations()
            // because that's only called at parse time.
        MyString value = nodeVar._value;
        MyString retryStr( retry );
        value.replaceString( "$(RETRY)", retryStr.Value() );
        MyString varStr = nodeVar._name + " = " + value;

        args.AppendArg( "-a" );
        args.AppendArg( varStr.Value() );
    }

        // Set the special DAG_STATUS variable (mainly for use by
        // "final" nodes).
    args.AppendArg( "-a" );
    MyString var = "DAG_STATUS = ";
    var += dm.dag->_dagStatus;
    args.AppendArg( var.Value() );

        // Set the special FAILED_COUNT variable (mainly for use by
        // "final" nodes).
    args.AppendArg( "-a" );
    var = "FAILED_COUNT = ";
    var += dm.dag->NumNodesFailed();
    args.AppendArg( var.Value() );

        // how big is the command line so far
    MyString display;
    args.GetArgsStringForDisplay( &display );
    int cmdLineSize = display.Length();

    parentNameArgs.GetArgsStringForDisplay( &display );
    int DAGParentNodeNamesLen = display.Length();

        // how many additional chars must we still add to command line
        // NOTE: according to the POSIX spec, the args + environ given
        // to exec() cannot exceed _POSIX_ARG_MAX, so we also need to
        // calculate & add the size of environ** to reserveNeeded
    int reserveNeeded = strlen( cmdFile );
    int maxCmdLine = _POSIX_ARG_MAX;

        // if we don't have room for DAGParentNodeNames, leave it unset
    if( cmdLineSize + reserveNeeded + DAGParentNodeNamesLen > maxCmdLine ) {
        debug_printf( DEBUG_NORMAL, "Warning: node %s has too many parents "
                    "to list in its classad; leaving its DAGParentNodeNames "
                    "attribute undefined\n", DAGNodeName );
        check_warning_strictness( DAG_STRICT_3 );
    } else {
        args.AppendArgsFromArgList( parentNameArgs );
    }

    if( hold_claim ) {
        args.AppendArg( "-a" );
        MyString holdit = MyString( "+" ) +
                    MyString( ATTR_JOB_KEEP_CLAIM_IDLE ) + " = " +
                    dm._claim_hold_time;
        args.AppendArg( holdit.Value() );
    }

    if ( dm._submitDagDeepOpts.suppress_notification ) {
        args.AppendArg( "-a" );
        MyString notify = MyString( "notification = never" );
        args.AppendArg( notify.Value() );
    }

    args.AppendArg( cmdFile );

    bool success = do_submit( args, condorID, dm.prohibitMultiJobs );

    if ( !tmpDir.Cd2MainDir( errMsg ) ) {
        debug_printf( DEBUG_QUIET,
                    "Could not change to original directory: %s\n",
                    errMsg.Value() );
        success = false;
    }

    return success;
}
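// Resulting command sketch (hedged; the node name, cluster ID, and paths
// are hypothetical, and the attribute spellings come from the ATTR_*
// macros above):
//
//     condor_submit -a '<dag node name attr> = NodeA'
//         -a '+DAGManJobId = 1234' -a 'DAGManJobId = 1234'
//         -a 'submit_event_notes = DAG Node: NodeA'
//         -a 'dagman_log = /path/to/dag.nodes.log' ... node.sub
//
// i.e. each "-a" above corresponds to one AppendArg( "-a" ) /
// AppendArg( ... ) pair built in the code.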
void
condor_event_timer()
{
    ASSERT( dagman.dag != NULL );

    //------------------------------------------------------------------------
    // Proceed with normal operation
    //
    // At this point, the DAG is bootstrapped.  All jobs premarked DONE
    // are in a STATUS_DONE state, and all their children have been
    // marked ready to submit.
    //
    // If recovery was needed, the log file has been completely read and
    // we are ready to proceed with jobs yet unsubmitted.
    //------------------------------------------------------------------------

    if( dagman.paused == true ) {
        debug_printf( DEBUG_DEBUG_1, "(DAGMan paused)\n" );
        return;
    }

    static int prevJobsDone = 0;
    static int prevJobs = 0;
    static int prevJobsFailed = 0;
    static int prevJobsSubmitted = 0;
    static int prevJobsReady = 0;
    static int prevScriptRunNodes = 0;
    static int prevJobsHeld = 0;

    int justSubmitted;
    justSubmitted = dagman.dag->SubmitReadyJobs( dagman );
    if( justSubmitted ) {
            // Note: it would be nice to also have the proc submit
            // count here.  wenger, 2006-02-08.
        debug_printf( DEBUG_VERBOSE, "Just submitted %d job%s this cycle...\n",
                    justSubmitted, justSubmitted == 1 ? "" : "s" );
    }

        // If the log has grown
    if( dagman.dag->DetectCondorLogGrowth() ) {
        if( dagman.dag->ProcessLogEvents( CONDORLOG ) == false ) {
            debug_printf( DEBUG_NORMAL,
                        "ProcessLogEvents(CONDORLOG) returned false\n" );
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
            return;
        }
    }

    if( dagman.dag->DetectDaPLogGrowth() ) {
        if( dagman.dag->ProcessLogEvents( DAPLOG ) == false ) {
            debug_printf( DEBUG_NORMAL,
                        "ProcessLogEvents(DAPLOG) returned false\n" );
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
            return;
        }
    }

        // print status if anything's changed (or we're in a high debug level)
    if( prevJobsDone != dagman.dag->NumNodesDone( true )
        || prevJobs != dagman.dag->NumNodes( true )
        || prevJobsFailed != dagman.dag->NumNodesFailed()
        || prevJobsSubmitted != dagman.dag->NumJobsSubmitted()
        || prevJobsReady != dagman.dag->NumNodesReady()
        || prevScriptRunNodes != dagman.dag->ScriptRunNodeCount()
        || prevJobsHeld != dagman.dag->NumHeldJobProcs()
        || DEBUG_LEVEL( DEBUG_DEBUG_4 ) ) {
        print_status();

        prevJobsDone = dagman.dag->NumNodesDone( true );
        prevJobs = dagman.dag->NumNodes( true );
        prevJobsFailed = dagman.dag->NumNodesFailed();
        prevJobsSubmitted = dagman.dag->NumJobsSubmitted();
        prevJobsReady = dagman.dag->NumNodesReady();
        prevScriptRunNodes = dagman.dag->ScriptRunNodeCount();
        prevJobsHeld = dagman.dag->NumHeldJobProcs();

        if( dagman.dag->GetDotFileUpdate() ) {
            dagman.dag->DumpDotFile();
        }
    }

    dagman.dag->DumpNodeStatus( false, false );

    ASSERT( dagman.dag->NumNodesDone( true ) + dagman.dag->NumNodesFailed()
                <= dagman.dag->NumNodes( true ) );

    //
    // If DAG is complete, hurray, and exit.
    //
    if( dagman.dag->DoneSuccess( true ) ) {
        ASSERT( dagman.dag->NumJobsSubmitted() == 0 );
        dagman.dag->CheckAllJobs();
        debug_printf( DEBUG_NORMAL, "All jobs Completed!\n" );
        dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
        if ( dagman.dag->NumIdleJobProcs() != 0 ) {
            debug_printf( DEBUG_NORMAL, "Warning: DAGMan thinks there "
                        "are %d idle jobs, even though the DAG is "
                        "completed!\n", dagman.dag->NumIdleJobProcs() );
            check_warning_strictness( DAG_STRICT_1 );
        }
        ExitSuccess();
        return;
    }

    //
    // DAG has failed -- dump rescue DAG.
    //
    if( dagman.dag->DoneFailed( true ) ) {
        main_shutdown_rescue( EXIT_ERROR, dagman.dag->_dagStatus );
        return;
    }

    //
    // DAG has succeeded but we haven't run final node yet, so do that.
    //
    if( dagman.dag->DoneSuccess( false ) ) {
        dagman.dag->StartFinalNode();
        return;
    }

        // If the DAG is halted, we don't want to actually exit yet if
        // jobs are still in the queue, or any POST scripts need to be
        // run (we need to run POST scripts so we don't "waste" jobs
        // that completed; on the other hand, we don't care about waiting
        // for PRE scripts because they'll be re-run when the rescue
        // DAG is run anyhow).
    if ( dagman.dag->IsHalted() && dagman.dag->NumJobsSubmitted() == 0 &&
                dagman.dag->PostRunNodeCount() == 0 &&
                !dagman.dag->RunningFinalNode() ) {
        debug_printf( DEBUG_QUIET, "Exiting because DAG is halted "
                    "and no jobs or scripts are running\n" );
        main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_HALTED );
        return;
    }

    //
    // If no jobs are submitted and no scripts are running, but the
    // dag is not complete, then at least one job failed, or a cycle
    // exists.  (Note that if the DAG completed successfully, we already
    // returned from this function above.)
    //
    if( dagman.dag->FinishedRunning( false ) ) {
        Dag::dag_status dagStatus = Dag::DAG_STATUS_OK;
        if( dagman.dag->DoneFailed( false ) ) {
            if( DEBUG_LEVEL( DEBUG_QUIET ) ) {
                debug_printf( DEBUG_QUIET,
                            "ERROR: the following job(s) failed:\n" );
                dagman.dag->PrintJobList( Job::STATUS_ERROR );
            }
            dagStatus = Dag::DAG_STATUS_NODE_FAILED;
        } else {
                // no jobs failed, so a cycle must exist
            debug_printf( DEBUG_QUIET, "ERROR: DAG finished but not all "
                        "nodes are complete -- checking for a cycle...\n" );
            if( dagman.dag->isCycle() ) {
                debug_printf( DEBUG_QUIET, "... ERROR: a cycle exists "
                            "in the dag, please check input\n" );
                dagStatus = Dag::DAG_STATUS_CYCLE;
            } else {
                debug_printf( DEBUG_QUIET, "... ERROR: no cycle found; "
                            "unknown error condition\n" );
                dagStatus = Dag::DAG_STATUS_ERROR;
            }
            if ( debug_level >= DEBUG_NORMAL ) {
                dagman.dag->PrintJobList();
            }
        }

        main_shutdown_rescue( EXIT_ERROR, dagStatus );
        return;
    }
}
//
// In Config() we get DAGMan-related configuration values.  This
// is a three-step process:
// 1. Get the name of the DAGMan-specific config file (if any).
// 2. If there is a DAGMan-specific config file, process it so
//    that its values are added to the configuration.
// 3. Get the values we want from the configuration.
//
bool
Dagman::Config()
{
    int debug_cache_size = (1024*1024)*5; // 5 MB
    bool debug_cache_enabled = false;

        // Note: debug_printfs are DEBUG_NORMAL here because when we
        // get here we haven't processed command-line arguments yet.

        // Get and process the DAGMan-specific config file (if any)
        // before getting any of the other parameters.
    _dagmanConfigFile = param( "DAGMAN_CONFIG_FILE" );
    if ( _dagmanConfigFile ) {
        debug_printf( DEBUG_NORMAL, "Using DAGMan config file: %s\n",
                    _dagmanConfigFile );
            // We do this test here because the corresponding error
            // message from the config code doesn't show up in dagman.out.
        if ( access( _dagmanConfigFile, R_OK ) != 0 &&
                    !is_piped_command( _dagmanConfigFile ) ) {
            debug_printf( DEBUG_QUIET,
                        "ERROR: Can't read DAGMan config file: %s\n",
                        _dagmanConfigFile );
            DC_Exit( EXIT_ERROR );
        }
        process_config_source( _dagmanConfigFile, "DAGMan config",
                    NULL, true );
    }

    _strict = (strict_level_t)param_integer( "DAGMAN_USE_STRICT",
                _strict, DAG_STRICT_0, DAG_STRICT_3 );
    debug_printf( DEBUG_NORMAL, "DAGMAN_USE_STRICT setting: %d\n",
                _strict );

    debug_level = (debug_level_t)param_integer( "DAGMAN_VERBOSITY",
                debug_level, DEBUG_SILENT, DEBUG_DEBUG_4 );
    debug_printf( DEBUG_NORMAL, "DAGMAN_VERBOSITY setting: %d\n",
                debug_level );

    debug_cache_size = param_integer( "DAGMAN_DEBUG_CACHE_SIZE",
                debug_cache_size, 0, INT_MAX );
    debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_SIZE setting: %d\n",
                debug_cache_size );

    debug_cache_enabled = param_boolean( "DAGMAN_DEBUG_CACHE_ENABLE",
                debug_cache_enabled );
    debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG_CACHE_ENABLE setting: %s\n",
                debug_cache_enabled ? "True" : "False" );

    submit_delay = param_integer( "DAGMAN_SUBMIT_DELAY", submit_delay, 0 );
    debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DELAY setting: %d\n",
                submit_delay );

    max_submit_attempts = param_integer( "DAGMAN_MAX_SUBMIT_ATTEMPTS",
                max_submit_attempts, 1, 16 );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_SUBMIT_ATTEMPTS setting: %d\n",
                max_submit_attempts );

    startup_cycle_detect = param_boolean( "DAGMAN_STARTUP_CYCLE_DETECT",
                startup_cycle_detect );
    debug_printf( DEBUG_NORMAL, "DAGMAN_STARTUP_CYCLE_DETECT setting: %s\n",
                startup_cycle_detect ? "True" : "False" );

    max_submits_per_interval = param_integer(
                "DAGMAN_MAX_SUBMITS_PER_INTERVAL",
                max_submits_per_interval, 1, 1000 );
    debug_printf( DEBUG_NORMAL,
                "DAGMAN_MAX_SUBMITS_PER_INTERVAL setting: %d\n",
                max_submits_per_interval );

    m_user_log_scan_interval = param_integer(
                "DAGMAN_USER_LOG_SCAN_INTERVAL",
                m_user_log_scan_interval, 1, INT_MAX );
    debug_printf( DEBUG_NORMAL,
                "DAGMAN_USER_LOG_SCAN_INTERVAL setting: %d\n",
                m_user_log_scan_interval );

    _defaultPriority = param_integer( "DAGMAN_DEFAULT_PRIORITY", 0,
                INT_MIN, INT_MAX, false );

    _submitDagDeepOpts.always_use_node_log = param_boolean(
                "DAGMAN_ALWAYS_USE_NODE_LOG", true );

        // Event checking setup...

        // We want to default to allowing the terminated/aborted
        // combination (that's what we've defaulted to in the past).
        // Okay, we also want to allow execute before submit because
        // we've run into that, and since DAGMan doesn't really care
        // about the execute events, it shouldn't abort the DAG.
        // And we further want to allow two terminated events for a
        // single job because people are seeing that with Globus
        // jobs!!
    allow_events = CheckEvents::ALLOW_TERM_ABORT |
                CheckEvents::ALLOW_EXEC_BEFORE_SUBMIT |
                CheckEvents::ALLOW_DOUBLE_TERMINATE |
                CheckEvents::ALLOW_DUPLICATE_EVENTS;

        // If the old DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION param is set,
        // we also allow extra runs.
        // Note: this parameter is probably only used by CDF, and only
        // really needed until they update all their systems to 6.7.3
        // or later (not 6.7.3 pre-release), which fixes the "double-run"
        // bug.
    bool allowExtraRuns = param_boolean(
                "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION", false );

    if ( allowExtraRuns ) {
        allow_events |= CheckEvents::ALLOW_RUN_AFTER_TERM;
        debug_printf( DEBUG_NORMAL, "Warning: "
                    "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION "
                    "is deprecated -- use DAGMAN_ALLOW_EVENTS instead\n" );
        check_warning_strictness( DAG_STRICT_1 );
    }

        // Now get the new DAGMAN_ALLOW_EVENTS value -- that can override
        // all of the previous stuff.
    allow_events = param_integer( "DAGMAN_ALLOW_EVENTS", allow_events );
    debug_printf( DEBUG_NORMAL, "allow_events ("
                "DAGMAN_IGNORE_DUPLICATE_JOB_EXECUTION, DAGMAN_ALLOW_EVENTS"
                ") setting: %d\n", allow_events );

        // ...end of event checking setup.

    retrySubmitFirst = param_boolean( "DAGMAN_RETRY_SUBMIT_FIRST",
                retrySubmitFirst );
    debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_SUBMIT_FIRST setting: %s\n",
                retrySubmitFirst ? "True" : "False" );

    retryNodeFirst = param_boolean( "DAGMAN_RETRY_NODE_FIRST",
                retryNodeFirst );
    debug_printf( DEBUG_NORMAL, "DAGMAN_RETRY_NODE_FIRST setting: %s\n",
                retryNodeFirst ? "True" : "False" );

    maxIdle = param_integer( "DAGMAN_MAX_JOBS_IDLE", maxIdle, 0, INT_MAX );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_IDLE setting: %d\n",
                maxIdle );

    maxJobs = param_integer( "DAGMAN_MAX_JOBS_SUBMITTED", maxJobs, 0,
                INT_MAX );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_JOBS_SUBMITTED setting: %d\n",
                maxJobs );

    maxPreScripts = param_integer( "DAGMAN_MAX_PRE_SCRIPTS", maxPreScripts,
                0, INT_MAX );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_PRE_SCRIPTS setting: %d\n",
                maxPreScripts );

    maxPostScripts = param_integer( "DAGMAN_MAX_POST_SCRIPTS",
                maxPostScripts, 0, INT_MAX );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_POST_SCRIPTS setting: %d\n",
                maxPostScripts );

    allowLogError = param_boolean( "DAGMAN_ALLOW_LOG_ERROR", allowLogError );
    debug_printf( DEBUG_NORMAL, "DAGMAN_ALLOW_LOG_ERROR setting: %s\n",
                allowLogError ? "True" : "False" );

    mungeNodeNames = param_boolean( "DAGMAN_MUNGE_NODE_NAMES",
                mungeNodeNames );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MUNGE_NODE_NAMES setting: %s\n",
                mungeNodeNames ? "True" : "False" );

    prohibitMultiJobs = param_boolean( "DAGMAN_PROHIBIT_MULTI_JOBS",
                prohibitMultiJobs );
    debug_printf( DEBUG_NORMAL, "DAGMAN_PROHIBIT_MULTI_JOBS setting: %s\n",
                prohibitMultiJobs ? "True" : "False" );

    submitDepthFirst = param_boolean( "DAGMAN_SUBMIT_DEPTH_FIRST",
                submitDepthFirst );
    debug_printf( DEBUG_NORMAL, "DAGMAN_SUBMIT_DEPTH_FIRST setting: %s\n",
                submitDepthFirst ? "True" : "False" );

    _runPost = param_boolean( "DAGMAN_ALWAYS_RUN_POST", true );
    debug_printf( DEBUG_NORMAL, "DAGMAN_ALWAYS_RUN_POST setting: %s\n",
                _runPost ? "True" : "False" );

    free( condorSubmitExe );
    condorSubmitExe = param( "DAGMAN_CONDOR_SUBMIT_EXE" );
    if( !condorSubmitExe ) {
        condorSubmitExe = strdup( "condor_submit" );
        ASSERT( condorSubmitExe );
    }

    free( condorRmExe );
    condorRmExe = param( "DAGMAN_CONDOR_RM_EXE" );
    if( !condorRmExe ) {
        condorRmExe = strdup( "condor_rm" );
        ASSERT( condorRmExe );
    }

    free( storkSubmitExe );
    storkSubmitExe = param( "DAGMAN_STORK_SUBMIT_EXE" );
    if( !storkSubmitExe ) {
        storkSubmitExe = strdup( "stork_submit" );
        ASSERT( storkSubmitExe );
    }

    free( storkRmExe );
    storkRmExe = param( "DAGMAN_STORK_RM_EXE" );
    if( !storkRmExe ) {
        storkRmExe = strdup( "stork_rm" );
        ASSERT( storkRmExe );
    }

    abortDuplicates = param_boolean( "DAGMAN_ABORT_DUPLICATES",
                abortDuplicates );
    debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_DUPLICATES setting: %s\n",
                abortDuplicates ? "True" : "False" );

    abortOnScarySubmit = param_boolean( "DAGMAN_ABORT_ON_SCARY_SUBMIT",
                abortOnScarySubmit );
    debug_printf( DEBUG_NORMAL, "DAGMAN_ABORT_ON_SCARY_SUBMIT setting: %s\n",
                abortOnScarySubmit ? "True" : "False" );

    pendingReportInterval = param_integer( "DAGMAN_PENDING_REPORT_INTERVAL",
                pendingReportInterval );
    debug_printf( DEBUG_NORMAL,
                "DAGMAN_PENDING_REPORT_INTERVAL setting: %d\n",
                pendingReportInterval );

    if ( param_boolean( "DAGMAN_OLD_RESCUE", false ) ) {
        debug_printf( DEBUG_NORMAL, "Warning: DAGMAN_OLD_RESCUE is "
                    "no longer supported\n" );
        check_warning_strictness( DAG_STRICT_1 );
    }

    autoRescue = param_boolean( "DAGMAN_AUTO_RESCUE", autoRescue );
    debug_printf( DEBUG_NORMAL, "DAGMAN_AUTO_RESCUE setting: %s\n",
                autoRescue ? "True" : "False" );

    maxRescueDagNum = param_integer( "DAGMAN_MAX_RESCUE_NUM",
                maxRescueDagNum, 0, ABS_MAX_RESCUE_DAG_NUM );
    debug_printf( DEBUG_NORMAL, "DAGMAN_MAX_RESCUE_NUM setting: %d\n",
                maxRescueDagNum );

    _writePartialRescueDag = param_boolean( "DAGMAN_WRITE_PARTIAL_RESCUE",
                _writePartialRescueDag );
    debug_printf( DEBUG_NORMAL, "DAGMAN_WRITE_PARTIAL_RESCUE setting: %s\n",
                _writePartialRescueDag ? "True" : "False" );

    free( _defaultNodeLog );
    _defaultNodeLog = param( "DAGMAN_DEFAULT_NODE_LOG" );
    debug_printf( DEBUG_NORMAL, "DAGMAN_DEFAULT_NODE_LOG setting: %s\n",
                _defaultNodeLog ? _defaultNodeLog : "null" );

    _generateSubdagSubmits = param_boolean( "DAGMAN_GENERATE_SUBDAG_SUBMITS",
                _generateSubdagSubmits );
    debug_printf( DEBUG_NORMAL,
                "DAGMAN_GENERATE_SUBDAG_SUBMITS setting: %s\n",
                _generateSubdagSubmits ? "True" : "False" );

    _maxJobHolds = param_integer( "DAGMAN_MAX_JOB_HOLDS", _maxJobHolds,
                0, 1000000 );

    _claim_hold_time = param_integer( "DAGMAN_HOLD_CLAIM_TIME",
                _claim_hold_time, 0, 3600 );

    char *debugSetting = param( "ALL_DEBUG" );
    debug_printf( DEBUG_NORMAL, "ALL_DEBUG setting: %s\n",
                debugSetting ? debugSetting : "" );
    if ( debugSetting ) {
        free( debugSetting );
    }

    debugSetting = param( "DAGMAN_DEBUG" );
    debug_printf( DEBUG_NORMAL, "DAGMAN_DEBUG setting: %s\n",
                debugSetting ? debugSetting : "" );
    if ( debugSetting ) {
        free( debugSetting );
    }

        // enable the debug cache if needed
    if ( debug_cache_enabled ) {
        debug_cache_set_size( debug_cache_size );
        debug_cache_enable();
    }

    return true;
}
//---------------------------------------------------------------------------
void main_init( int argc, char ** const argv )
{
    printf( "Executing condor dagman ... \n" );

        // flag used if DAGMan is invoked with -WaitForDebug so we
        // wait for a developer to attach with a debugger...
    volatile int wait_for_debug = 0;

        // process any config vars -- this happens before we process
        // argv[], since arguments should override config settings
    dagman.Config();

        // The DCpermission (last parm) should probably be PARENT, if it existed
    daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
                (SignalHandler) main_shutdown_remove,
                "main_shutdown_remove", NULL );

/****** FOR TESTING *******
    daemonCore->Register_Signal( SIGUSR2, "SIGUSR2",
                (SignalHandler) main_testing_stub,
                "main_testing_stub", NULL );
****** FOR TESTING ********/

    debug_progname = condor_basename( argv[0] );

        // condor_submit_dag version from .condor.sub
    bool allowVerMismatch = false;
    const char *csdVersion = "undefined";

    int i;
    for ( i = 0; i < argc; i++ ) {
        debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] );
    }

    if ( argc < 2 ) Usage();  // Make sure an input file was specified

        // get dagman job id from environment, if it's there
        // (otherwise it will be set to "-1.-1.-1")
    dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) );

    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // Minimum legal version for a .condor.sub file to be compatible
        // with this condor_dagman binary.

        // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
        // Be sure to change this if the arguments or environment
        // passed to condor_dagman change in an incompatible way!!
        // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

    struct DagVersionData {
        int majorVer;
        int minorVer;
        int subMinorVer;
    };
    const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 };

        // Construct a string of the minimum submit file version.
    MyString minSubmitVersionStr;
    minSubmitVersionStr.formatstr( "%d.%d.%d",
                MIN_SUBMIT_FILE_VERSION.majorVer,
                MIN_SUBMIT_FILE_VERSION.minorVer,
                MIN_SUBMIT_FILE_VERSION.subMinorVer );
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    //
    // Process command-line arguments
    //
    for ( i = 1; i < argc; i++ ) {
        if( !strcasecmp( "-Debug", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No debug level specified\n" );
                Usage();
            }
            debug_level = (debug_level_t) atoi( argv[i] );
        } else if( !strcasecmp( "-Lockfile", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No DagMan lockfile specified\n" );
                Usage();
            }
            lockFileName = argv[i];
        } else if( !strcasecmp( "-Help", argv[i] ) ) {
            Usage();
        } else if( !strcasecmp( "-Dag", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DAG specified\n" );
                Usage();
            }
            dagman.dagFiles.append( argv[i] );
        } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "Integer missing after -MaxIdle\n" );
                Usage();
            }
            dagman.maxIdle = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "Integer missing after -MaxJobs\n" );
                Usage();
            }
            dagman.maxJobs = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) {
            debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced "
                        "with -MaxPre and -MaxPost arguments\n" );
            Usage();
        } else if( !strcasecmp( "-MaxPre", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "Integer missing after -MaxPre\n" );
                Usage();
            }
            dagman.maxPreScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxPost", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "Integer missing after -MaxPost\n" );
                Usage();
            }
            dagman.maxPostScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) {
            debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is "
                        "ignored; please use the DAGMAN_ALLOW_EVENTS "
                        "config parameter instead\n" );
            check_warning_strictness( DAG_STRICT_1 );
        } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) {
            dagman.allowLogError = true;
        } else if( !strcasecmp( "-DontAlwaysRunPost", argv[i] ) ) {
            dagman._runPost = false;
        } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) {
            wait_for_debug = 1;
        } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) {
            dagman.useDagDir = true;
        } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No AutoRescue value specified\n" );
                Usage();
            }
            dagman.autoRescue = ( atoi( argv[i] ) != 0 );
        } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No rescue DAG number specified\n" );
                Usage();
            }
            dagman.doRescueFrom = atoi( argv[i] );
        } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No CsdVersion value specified\n" );
                Usage();
            }
            csdVersion = argv[i];
        } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) {
            allowVerMismatch = true;
        } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) {
            dagman.dumpRescueDag = true;
        } else if( !strcasecmp( "-verbose", argv[i] ) ) {
            dagman._submitDagDeepOpts.bVerbose = true;
        } else if( !strcasecmp( "-force", argv[i] ) ) {
            dagman._submitDagDeepOpts.bForce = true;
        } else if( !strcasecmp( "-notification", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No notification value specified\n" );
                Usage();
            }
            dagman._submitDagDeepOpts.strNotification = argv[i];
        } else if( !strcasecmp( "-dagman", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No dagman value specified\n" );
                Usage();
            }
            dagman._submitDagDeepOpts.strDagmanPath = argv[i];
        } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
                            "No outfile_dir value specified\n" );
                Usage();
            }
            dagman._submitDagDeepOpts.strOutfileDir = argv[i];
        } else if( !strcasecmp( "-update_submit", argv[i] ) ) {
            dagman._submitDagDeepOpts.updateSubmit = true;
        } else if( !strcasecmp( "-import_env", argv[i] ) ) {
            dagman._submitDagDeepOpts.importEnv = true;
        } else if( !strcasecmp( "-priority", argv[i] ) ) {
            ++i;
            if( i >= argc || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_NORMAL,
                            "No priority value specified\n" );
                Usage();
            }
            dagman._submitDagDeepOpts.priority = atoi( argv[i] );
        } else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) {
            dagman._submitDagDeepOpts.always_use_node_log = false;
        } else {
            debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n",
                        argv[i] );
            Usage();
        }
    }

    dagman.dagFiles.rewind();
    dagman.primaryDagFile = dagman.dagFiles.next();
    dagman.multiDags = ( dagman.dagFiles.number() > 1 );

    MyString tmpDefaultLog;
    if ( dagman._defaultNodeLog != NULL ) {
        tmpDefaultLog = dagman._defaultNodeLog;
        free( dagman._defaultNodeLog );
    } else {
        tmpDefaultLog = dagman.primaryDagFile + ".nodes.log";
    }

        // Force default log file path to be absolute so it works
        // with -usedagdir and DIR nodes.
    CondorError errstack;
    if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack ) ) {
        debug_printf( DEBUG_QUIET, "Unable to convert default log "
                    "file name to absolute path: %s\n",
                    errstack.getFullText().c_str() );
        dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR );
        DC_Exit( EXIT_ERROR );
    }
    dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() );
    debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n",
                dagman._defaultNodeLog );

    //
    // Check the arguments
    //

    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        // Checking for version compatibility between the .condor.sub
        // file and this condor_dagman binary...

        // Note: if we're in recovery mode and the submit file version
        // causes us to quit, we leave any existing node jobs still
        // running -- may want to change that eventually.  wenger 2009-10-13.

        // Version of the condor_submit_dag that created our submit file.
    CondorVersionInfo submitFileVersion( csdVersion );

        // Version of this condor_dagman binary.
    CondorVersionInfo dagmanVersion;

        // Just generate this message fragment in one place.
    MyString versionMsg;
    versionMsg.formatstr( "the version (%s) of this DAG's Condor submit "
                "file (created by condor_submit_dag)", csdVersion );

        // Make sure version in submit file is valid.
    if( !submitFileVersion.is_valid() ) {
        if ( !allowVerMismatch ) {
            debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n",
                        versionMsg.Value() );
            DC_Exit( EXIT_ERROR );
        } else {
            debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; "
                        "continuing because of -AllowVersionMismatch flag\n",
                        versionMsg.Value() );
        }

        // Make sure .condor.sub file is recent enough.
    } else if ( submitFileVersion.compare_versions(
                CondorVersion() ) != 0 ) {
        if( !submitFileVersion.built_since_version(
                    MIN_SUBMIT_FILE_VERSION.majorVer,
                    MIN_SUBMIT_FILE_VERSION.minorVer,
                    MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) {
            if ( !allowVerMismatch ) {
                debug_printf( DEBUG_QUIET, "Error: %s is older than "
                            "oldest permissible version (%s)\n",
                            versionMsg.Value(), minSubmitVersionStr.Value() );
                DC_Exit( EXIT_ERROR );
            } else {
                debug_printf( DEBUG_NORMAL, "Warning: %s is older than "
                            "oldest permissible version (%s); continuing "
                            "because of -AllowVersionMismatch flag\n",
                            versionMsg.Value(), minSubmitVersionStr.Value() );
            }

            // Warn if .condor.sub file is a newer version than this binary.
        } else if ( dagmanVersion.compare_versions( csdVersion ) > 0 ) {
            debug_printf( DEBUG_NORMAL, "Warning: %s is newer than "
                        "condor_dagman version (%s)\n", versionMsg.Value(),
                        CondorVersion() );
            check_warning_strictness( DAG_STRICT_3 );
        } else {
            debug_printf( DEBUG_NORMAL, "Note: %s differs from "
                        "condor_dagman version (%s), but the "
                        "difference is permissible\n",
                        versionMsg.Value(), CondorVersion() );
        }
    }
    //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    if( dagman.primaryDagFile == "" ) {
        debug_printf( DEBUG_SILENT, "No DAG file was specified\n" );
        Usage();
    }
    if ( lockFileName == NULL ) {
        debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" );
        Usage();
    }
    if( dagman.maxJobs < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n" );
        Usage();
    }
    if( dagman.maxPreScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" );
        Usage();
    }
    if( dagman.maxPostScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" );
        Usage();
    }
    if( dagman.doRescueFrom < 0 ) {
        debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" );
        Usage();
    }

    debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n",
                lockFileName );
    if ( dagman.dagFiles.number() == 1 ) {
        debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n",
                    dagman.primaryDagFile.Value() );
    } else {
        MyString msg = "DAG Input files are ";
        dagman.dagFiles.rewind();
        const char *dagFile;
        while ( (dagFile = dagman.dagFiles.next()) != NULL ) {
            msg += dagFile;
            msg += " ";
        }
        msg += "\n";
        debug_printf( DEBUG_VERBOSE, "%s", msg.Value() );
    }

        // if requested, wait for someone to attach with a debugger...
    while( wait_for_debug ) { }

    {
        MyString cwd;
        if( !condor_getcwd( cwd ) ) {
            cwd = "<null>";
        }
        debug_printf( DEBUG_DEBUG_1, "Current path is %s\n", cwd.Value() );

        char *temp = my_username();
        debug_printf( DEBUG_DEBUG_1, "Current user is %s\n",
                    temp ? temp : "<null>" );
        if( temp ) {
            free( temp );
        }
    }

        //
        // Figure out the rescue DAG to run, if any (this is with "new-
        // style" rescue DAGs).
        //
    int rescueDagNum = 0;
    MyString rescueDagMsg;

    if ( dagman.doRescueFrom != 0 ) {
        rescueDagNum = dagman.doRescueFrom;
        rescueDagMsg.formatstr( "Rescue DAG number %d specified",
                    rescueDagNum );
        RenameRescueDagsAfter( dagman.primaryDagFile.Value(),
                    dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum );
    } else if ( dagman.autoRescue ) {
        rescueDagNum = FindLastRescueDagNum( dagman.primaryDagFile.Value(),
                    dagman.multiDags, dagman.maxRescueDagNum );
        rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum );
    }

        //
        // Fill in values in the deep submit options that we haven't
        // already set.
        //
    dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError;
    dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir;
    dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue;
    dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom;
    dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch;
    dagman._submitDagDeepOpts.recurse = false;

    //
    // Create the DAG
    //

        // Note: a bunch of the parameters we pass here duplicate things
        // in submitDagOpts, but I'm keeping them separate so we don't have
        // to bother to construct a new SubmitDagOptions object for splices.
        // wenger 2010-03-25
    dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs,
                dagman.maxPreScripts, dagman.maxPostScripts,
                dagman.allowLogError, dagman.useDagDir,
                dagman.maxIdle, dagman.retrySubmitFirst,
                dagman.retryNodeFirst, dagman.condorRmExe,
                dagman.storkRmExe, &dagman.DAGManJobId,
                dagman.prohibitMultiJobs, dagman.submitDepthFirst,
                dagman._defaultNodeLog, dagman._generateSubdagSubmits,
                &dagman._submitDagDeepOpts, false ); /* toplevel dag! */

    if( dagman.dag == NULL ) {
        EXCEPT( "ERROR: out of memory!\n" );
    }

    dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit );
    dagman.dag->SetAllowEvents( dagman.allow_events );
    dagman.dag->SetConfigFile( dagman._dagmanConfigFile );
    dagman.dag->SetMaxJobHolds( dagman._maxJobHolds );
    dagman.dag->SetPostRun( dagman._runPost );
    if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line
        dagman.dag->SetDefaultPriority( dagman._submitDagDeepOpts.priority );
    } else if( dagman._defaultPriority != 0 ) { // From config file
        dagman.dag->SetDefaultPriority( dagman._defaultPriority );
        dagman._submitDagDeepOpts.priority = dagman._defaultPriority;
    }

    //
    // Parse the input files.  The parse() routine
    // takes care of adding jobs and dependencies to the DagMan
    //
    dagman.mungeNodeNames = ( dagman.dagFiles.number() > 1 );
    parseSetDoNameMunge( dagman.mungeNodeNames );
    debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n",
                dagman.dagFiles.number() );
    dagman.dagFiles.rewind();
    char *dagFile;

        // Here we make a copy of the dagFiles for iteration purposes.
        // Deep inside of the parsing, copies of the dagman.dagFile
        // string list happen which mess up the iteration of this list.
    StringList sl( dagman.dagFiles );
    sl.rewind();
    while ( (dagFile = sl.next()) != NULL ) {
        debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile );

        if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) {
            if ( dagman.dumpRescueDag ) {
                    // Dump the rescue DAG so we can see what we got
                    // in the failed parse attempt.
                debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
                            "because of -DumpRescue flag\n" );
                dagman.dag->Rescue( dagman.primaryDagFile.Value(),
                            dagman.multiDags, dagman.maxRescueDagNum,
                            false, true, false );
            }

            dagman.dag->RemoveRunningJobs( dagman, true );
            MSC_SUPPRESS_WARNING_FIXME(6031) // return value of unlink ignored.
            unlink( lockFileName );
            dagman.CleanUp();

                // Note: debug_error calls DC_Exit().
            debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n", dagFile );
        }
    }

    if( dagman.dag->GetDefaultPriority() != 0 ) {
        dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag
    }

    dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId );
    if ( rescueDagNum > 0 ) {
            // Get our Pegasus sequence numbers set correctly.
        dagman.dag->GetJobstateLog().InitializeRescue();
    }

        // lift the final set of splices into the main dag.
    dagman.dag->LiftSplices( SELF );

        //
        // Actually parse the "new-new" style (partial DAG info only)
        // rescue DAG here.  Note: this *must* be done after splices
        // are lifted!
        //
    if ( rescueDagNum > 0 ) {
        dagman.rescueFileToRun = RescueDagName(
                    dagman.primaryDagFile.Value(),
                    dagman.multiDags, rescueDagNum );
        debug_printf( DEBUG_QUIET, "%s; running %s in combination with "
                    "normal DAG file%s\n", rescueDagMsg.Value(),
                    dagman.rescueFileToRun.Value(),
                    dagman.multiDags ? "s" : "" );
        debug_printf( DEBUG_QUIET,
                    "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" );
        debug_printf( DEBUG_QUIET, "USING RESCUE DAG %s\n",
                    dagman.rescueFileToRun.Value() );

            // Turn off node name munging for the rescue DAG, because
            // it will already have munged node names.
        parseSetDoNameMunge( false );

        if( !parse( dagman.dag, dagman.rescueFileToRun.Value(),
                    dagman.useDagDir ) ) {
            if ( dagman.dumpRescueDag ) {
                    // Dump the rescue DAG so we can see what we got
                    // in the failed parse attempt.
                debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
                            "because of -DumpRescue flag\n" );
                dagman.dag->Rescue( dagman.primaryDagFile.Value(),
                            dagman.multiDags, dagman.maxRescueDagNum,
                            true, false );
            }

            dagman.dag->RemoveRunningJobs( dagman, true );
            MSC_SUPPRESS_WARNING_FIXME(6031) // return value of unlink ignored.
            unlink( lockFileName );
            dagman.CleanUp();

                // Note: debug_error calls DC_Exit().  (Report the rescue
                // DAG file here -- the dagFile pointer from the earlier
                // parse loop is no longer valid at this point.)
            debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
                        dagman.rescueFileToRun.Value() );
        }
    }

    dagman.dag->CheckThrottleCats();

        // fix up any use of $(JOB) in the vars values for any node
    dagman.dag->ResolveVarsInterpolations();

/*  debug_printf( DEBUG_QUIET, "COMPLETED DAG!\n" ); */
/*  dagman.dag->PrintJobList(); */

#ifndef NOT_DETECT_CYCLE
    if( dagman.startup_cycle_detect && dagman.dag->isCycle() ) {
            // Note: maybe we should run the final node here, if there
            // is one.  wenger 2011-12-19.
        debug_error( 1, DEBUG_QUIET,
                    "ERROR: a cycle exists in the dag, please check input\n" );
    }
#endif

    debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n",
                dagman.dag->NumNodes( true ) );

    MyString firstLocation;
    if ( dagman.dag->GetReject( firstLocation ) ) {
        debug_printf( DEBUG_QUIET, "Exiting because of REJECT "
                    "specification in %s.  This most likely means "
                    "that the DAG file was produced with the -DumpRescue "
                    "flag when parsing the original DAG failed.\n",
                    firstLocation.Value() );
        DC_Exit( EXIT_ERROR );
        return;
    }

    dagman.dag->DumpDotFile();

    if ( dagman.dumpRescueDag ) {
        debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting "
                    "because of -DumpRescue flag\n" );
        dagman.dag->Rescue( dagman.primaryDagFile.Value(),
                    dagman.multiDags, dagman.maxRescueDagNum,
                    false, false, false );
        ExitSuccess();
        return;
    }

    //------------------------------------------------------------------------
    // Bootstrap and Recovery
    //
    // If the Lockfile exists, this indicates a premature termination
    // of a previous run of DAGMan.  If the condor log is also present,
    // we run in recovery mode.
    // If the Daglog is not present, then we do not run in recovery
    // mode.
    //------------------------------------------------------------------------

    {
        bool recovery = ( access( lockFileName, F_OK ) == 0 );

        if ( recovery ) {
            debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n",
                        lockFileName );
            if ( dagman.abortDuplicates ) {
                if ( util_check_lock_file( lockFileName ) == 1 ) {
                    debug_printf( DEBUG_QUIET, "Aborting because it "
                                "looks like another instance of DAGMan is "
                                "currently running on this DAG; if that is "
                                "not the case, delete the lock file (%s) "
                                "and re-submit the DAG.\n", lockFileName );
                    dagman.dag->GetJobstateLog().WriteDagmanFinished(
                                EXIT_RESTART );
                    dagman.CleanUp();
                    DC_Exit( EXIT_ERROR );
                        // We should never get to here!
                }
            }
        }

            //
            // If this DAGMan continues, it should overwrite the lock
            // file if it exists.
            //
        util_create_lock_file( lockFileName, dagman.abortDuplicates );

        debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n" );
        if( !dagman.dag->Bootstrap( recovery ) ) {
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n" );
        }
    }

    debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" );
    daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval,
                condor_event_timer, "condor_event_timer" );

    dagman.dag->SetPendingNodeReportInterval(
                dagman.pendingReportInterval );
}
int
util_create_lock_file( const char *lockFileName, bool abortDuplicates )
{
    int result = 0;

    FILE *fp = safe_fopen_wrapper_follow( lockFileName, "w" );
    if ( fp == NULL ) {
        debug_printf( DEBUG_QUIET,
                    "ERROR: could not open lock file %s for writing.\n",
                    lockFileName );
        result = -1;
    }

        //
        // Create the ProcessId object.
        //
    ProcessId *procId = NULL;
    if ( result == 0 && abortDuplicates ) {
        int status;
        int precision_range = 1;
        if ( ProcAPI::createProcessId( daemonCore->getpid(), procId,
                    status, &precision_range ) != PROCAPI_SUCCESS ) {
            debug_printf( DEBUG_QUIET, "ERROR: ProcAPI::createProcessId() "
                        "failed; %d\n", status );
            result = -1;
        }
    }

        //
        // Write out the ProcessId object.
        //
    if ( result == 0 && abortDuplicates ) {
        if ( procId->write( fp ) != ProcessId::SUCCESS ) {
            debug_printf( DEBUG_QUIET,
                        "ERROR: ProcessId::write() failed\n" );
            result = -1;
        }
    }

        //
        // Sleep to ensure uniqueness of the ProcessId object.
        //
    if ( result == 0 && abortDuplicates ) {
        const int maxSleepTime = 60; // seconds; arbitrarily chosen
        int sleepTime = procId->computeWaitTime();

        if ( sleepTime > maxSleepTime ) {
            debug_printf( DEBUG_QUIET, "Warning: ProcessId computed sleep "
                        "time (%d) exceeds maximum (%d); skipping sleep/"
                        "confirm step\n", sleepTime, maxSleepTime );
            check_warning_strictness( DAG_STRICT_3 );
        } else {
            debug_printf( DEBUG_NORMAL, "Sleeping for %d seconds to "
                        "ensure ProcessId uniqueness\n", sleepTime );

#if defined(WIN32)
            sleep( sleepTime );
#else
            while( (sleepTime = sleep( sleepTime )) != 0 ) { }
#endif

                //
                // Confirm the ProcessId object's uniqueness.
                //
            int status;
            if ( ProcAPI::confirmProcessId( *procId, status ) !=
                        PROCAPI_SUCCESS ) {
                debug_printf( DEBUG_QUIET, "Warning: ProcAPI::"
                            "confirmProcessId() failed; %d\n", status );
                check_warning_strictness( DAG_STRICT_3 );
            } else {
                if ( !procId->isConfirmed() ) {
                    debug_printf( DEBUG_QUIET, "Warning: ProcessId not "
                                "confirmed unique\n" );
                    check_warning_strictness( DAG_STRICT_3 );
                } else {
                        //
                        // Write out the confirmation.
                        //
                    if ( procId->writeConfirmationOnly( fp ) !=
                                ProcessId::SUCCESS ) {
                        debug_printf( DEBUG_QUIET, "ERROR: ProcessId::"
                                    "writeConfirmationOnly() failed\n" );
                        result = -1;
                    }
                }
            }
        }
    }

    delete procId;

    if ( fp != NULL ) {
        if ( fclose( fp ) != 0 ) {
            debug_printf( DEBUG_QUIET, "ERROR: closing lock "
                        "file failed with errno %d (%s)\n", errno,
                        strerror( errno ) );
        }
    }

    return result;
}
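// Lock-file layout note (hedged): on success the file written above holds
// a serialized ProcessId, optionally followed by a confirmation record
// once uniqueness has been verified; util_check_lock_file() (used from
// main_init() at startup) reads it back to detect a still-running
// duplicate DAGMan instance.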
//---------------------------------------------------------------------------
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
            ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
            bool recovery, const char *defaultNodeLog, bool usingDefault )
{
    debug_printf( DEBUG_DEBUG_2,
                "Attempting to monitor log file for node %s\n",
                GetJobName() );

    if ( _logIsMonitored ) {
        debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
                    "%s is already monitored\n", GetJobName() );
        return true;
    }

    ReadMultipleUserLogs &logReader = ( _jobType == TYPE_CONDOR ) ?
                condorLogReader : storkLogReader;

    std::string logFileStr;
    if ( _jobType == TYPE_CONDOR ) {
            // Check whether the user has specified a log file;
            // if not, we fall back to a default.
        MyString templogFileStr = MultiLogFiles::loadLogFileNameFromSubFile(
                    _cmdFile, _directory, _logFileIsXml, usingDefault );
        logFileStr = templogFileStr.Value();
    } else {
        StringList logFiles;
        MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
                    _cmdFile, _directory, logFiles );
        if ( tmpResult != "" ) {
            debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
                        tmpResult.Value() );
            LogMonitorFailed();
            return false;
        } else if ( logFiles.number() != 1 ) {
            debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
                        "in submit file %s; we want 1\n",
                        logFiles.number(), _cmdFile );
            LogMonitorFailed();
            return false;
        } else {
            logFiles.rewind();
            logFileStr = logFiles.next();
        }
    }

        // Warn the user if the node's log file is in /tmp.
    if ( logFileStr.find( "/tmp" ) == 0 ) {
        debug_printf( DEBUG_QUIET, "Warning: "
                    "Log file %s for node %s is in /tmp\n",
                    logFileStr.c_str(), GetJobName() );
        check_warning_strictness( usingDefault ?
                    DAG_STRICT_2 : DAG_STRICT_1 );
    }

    if ( logFileStr == "" ) {
        logFileStr = defaultNodeLog;
        _useDefaultLog = true;
            // The default user log is never XML.  This could be
            // specified in the submit file and should be ignored.
        _logFileIsXml = false;
        debug_printf( DEBUG_NORMAL, "Unable to get log file from "
                    "submit file %s (node %s); using default (%s)\n",
                    _cmdFile, GetJobName(), logFileStr.c_str() );
        append_default_log = false;
    } else {
        append_default_log = usingDefault;
        if( append_default_log ) {
                // DAGMan is not going to look at the user-specified log.
                // It will look at the defaultNode log.
            logFileStr = defaultNodeLog;
            _useDefaultLog = false;
            _logFileIsXml = false;
        }
    }

        // This function returns true if the log file is on NFS and
        // that is an error.  If the log file is on NFS, but nfsIsError
        // is false, it prints a warning but returns false.
    if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
                nfsIsError ) ) {
        debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
                    logFileStr.c_str() );
        LogMonitorFailed();
        return false;
    }

    delete [] _logFile;
        // Saving the log file here in case the submit file gets changed.
    _logFile = strnewp( logFileStr.c_str() );
    debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
                GetLogFile(), GetJobName() );
    CondorError errstack;
    if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
        errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
                    "ERROR: Unable to monitor log file for node %s",
                    GetJobName() );
        debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
        LogMonitorFailed();
        EXCEPT( "Fatal log file monitoring error!\n" );
        return false;
    }

    _logIsMonitored = true;

    return true;
}
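// Log-selection recap (hedged summary of the logic above): the monitored
// file is the submit file's own log unless none is specified or the
// default node log is in force, in which case defaultNodeLog is used; a
// log under /tmp draws a strictness warning, and a log on NFS is fatal
// only when nfsIsError is true.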