// this gets called by DC when DAGMan receives a SIGUSR1 -- which,
// assuming the DAGMan submit file was properly written, is the signal
// the schedd will send if the DAGMan job is removed from the queue
int main_shutdown_remove(Service *, int) {
	debug_printf( DEBUG_QUIET, "Received SIGUSR1\n" );
	main_shutdown_rescue( EXIT_ABORT, Dag::DAG_STATUS_RM );
	return FALSE;
}
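// For illustration only (an assumption about the generated submit file,
// not something defined in this source): the "properly written" DAGMan
// submit file mentioned above is expected to contain a line like
//
//     remove_kill_sig = SIGUSR1
//
// so that removing the DAGMan job from the queue causes the schedd to
// deliver SIGUSR1 (handled here) rather than the default kill signal.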
static bool
submit_try( ArgList &args, CondorID &condorID, bool prohibitMultiJobs )
{
	MyString cmd; // for debug output
	args.GetArgsStringForDisplay( &cmd );

	FILE * fp = my_popen( args, "r", TRUE );
	if ( fp == NULL ) {
		debug_printf( DEBUG_NORMAL,
					  "ERROR: my_popen(%s) in submit_try() failed!\n",
					  cmd.Value() );
		return false;
	}

	//----------------------------------------------------------------------
	// Parse submit command output for an HTCondor job ID.  This
	// desperately needs to be replaced by HTCondor submit APIs.
	//
	// Typical condor_submit output for HTCondor v6 looks like:
	//
	//   Submitting job(s).
	//   Logging submit event(s).
	//   1 job(s) submitted to cluster 2267.
	//----------------------------------------------------------------------

	char buffer[UTIL_MAX_LINE_LENGTH];
	buffer[0] = '\0';

	// Configure what we look for in the command output according to
	// which type of job we have.
	const char *marker = NULL;
	parse_submit_fnc parseFnc = NULL;

	marker = " submitted to cluster ";

	// Note: we *could* check how many jobs got submitted here, and
	// correlate that with how many submit events we see later on.
	// I'm not worrying about that for now...  wenger 2006-02-07.
	// We also have to check the number of jobs to get an accurate
	// count of submitted jobs to report in the dagman.out file.
	// We should also check whether we got more than one cluster, and
	// either deal with it correctly or generate an error message.
	parseFnc = parse_condor_submit;

	// Take all of the output (both stdout and stderr) from condor_submit,
	// and echo it to the dagman.out file.  Look for the line (if any)
	// containing the word "cluster" (HTCondor).  If we don't find such a
	// line, something went wrong with the submit, so we return false.
	// The caller of this function can retry the submit by repeatedly
	// calling this function.

	MyString command_output("");
	MyString keyLine("");
	while ( fgets( buffer, UTIL_MAX_LINE_LENGTH, fp ) ) {
		MyString buf_line = buffer;
		buf_line.chomp();
		debug_printf( DEBUG_VERBOSE, "From submit: %s\n", buf_line.Value() );
		command_output += buf_line;
		if ( strstr( buffer, marker ) != NULL ) {
			keyLine = buf_line;
		}
	}

	{ // Relocated this curly bracket to its previous position to hopefully
	  // fix Coverity warning.  Not sure why these curly brackets are here
	  // at all...  wenger 2013-06-12
		int status = my_pclose( fp ) & 0xff;

		if ( keyLine == "" ) {
			debug_printf( DEBUG_NORMAL, "failed while reading from pipe.\n" );
			debug_printf( DEBUG_NORMAL, "Read so far: %s\n",
						  command_output.Value() );
			return false;
		}

		if ( status != 0 ) {
			debug_printf( DEBUG_NORMAL, "Read from pipe: %s\n",
						  command_output.Value() );
			debug_printf( DEBUG_QUIET, "ERROR while running \"%s\": "
						  "my_pclose() failed with status %d "
						  "(errno %d, %s)!\n", cmd.Value(), status,
						  errno, strerror( errno ) );
			return false;
		}
	}

	int jobProcCount;
	if ( !parseFnc( keyLine.Value(), jobProcCount, condorID._cluster ) ) {
		// We are going forward (do not return false here).
		// Expectation is that higher levels will catch that we
		// did not get a cluster initialized properly here, fail,
		// and write a rescue DAG.  gt3658 2013-06-03
		//
		// This is better than the old failure that would submit
		// DAGMAN_MAX_SUBMIT_ATTEMPT copies of the same job.
		debug_printf( DEBUG_NORMAL, "WARNING: submit returned 0, but "
					  "parsing submit output failed!\n" );
		// Returning here so we don't try to process invalid values
		// below.  (This should really return something like "submit
		// failed, don't retry" -- see gittrac #3685.)
		return true;
	}
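	// A minimal sketch (an assumption -- the real parse_condor_submit
	// lives elsewhere in the DAGMan sources) of what a parse_submit_fnc
	// is expected to do with the key line captured above, e.g.
	// "1 job(s) submitted to cluster 2267.":
	//
	//     static bool parse_condor_submit( const char *line,
	//             int &jobProcCount, int &cluster )
	//     {
	//         // Pull out the proc count and cluster ID; both must parse.
	//         return sscanf( line, "%d job(s) submitted to cluster %d",
	//                        &jobProcCount, &cluster ) == 2;
	//     }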
	// Check for multiple job procs if configured to disallow that.
	if ( prohibitMultiJobs && (jobProcCount > 1) ) {
		debug_printf( DEBUG_NORMAL, "Submit generated %d job procs; "
					  "disallowed by DAGMAN_PROHIBIT_MULTI_JOBS setting\n",
					  jobProcCount );
		main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
	}

	return true;
}
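// A minimal sketch, assuming the standard HTCondor param_boolean()
// config accessor, of how the prohibitMultiJobs argument to
// submit_try() would typically be derived (the exact call site is
// not shown in this excerpt):
//
//     bool prohibitMultiJobs =
//             param_boolean( "DAGMAN_PROHIBIT_MULTI_JOBS", false );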
void condor_event_timer () {
	ASSERT( dagman.dag != NULL );

	//------------------------------------------------------------------------
	// Proceed with normal operation
	//
	// At this point, the DAG is bootstrapped.  All jobs premarked DONE
	// are in a STATUS_DONE state, and all their children have been
	// marked ready to submit.
	//
	// If recovery was needed, the log file has been completely read and
	// we are ready to proceed with jobs yet unsubmitted.
	//------------------------------------------------------------------------

	if( dagman.paused == true ) {
		debug_printf( DEBUG_DEBUG_1, "(DAGMan paused)\n" );
		return;
	}

	static int prevJobsDone = 0;
	static int prevJobs = 0;
	static int prevJobsFailed = 0;
	static int prevJobsSubmitted = 0;
	static int prevJobsReady = 0;
	static int prevScriptRunNodes = 0;
	static int prevJobsHeld = 0;

	int justSubmitted;
	justSubmitted = dagman.dag->SubmitReadyJobs( dagman );
	if( justSubmitted ) {
		// Note: it would be nice to also have the proc submit
		// count here.  wenger, 2006-02-08.
		debug_printf( DEBUG_VERBOSE, "Just submitted %d job%s this cycle...\n",
					  justSubmitted, justSubmitted == 1 ? "" : "s" );
	}

	// If the log has grown, process the new events.
	if( dagman.dag->DetectCondorLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( CONDORLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						  "ProcessLogEvents(CONDORLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

	if( dagman.dag->DetectDaPLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( DAPLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						  "ProcessLogEvents(DAPLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

	// Print status if anything's changed (or we're at a high debug level).
	if( prevJobsDone != dagman.dag->NumNodesDone( true )
		|| prevJobs != dagman.dag->NumNodes( true )
		|| prevJobsFailed != dagman.dag->NumNodesFailed()
		|| prevJobsSubmitted != dagman.dag->NumJobsSubmitted()
		|| prevJobsReady != dagman.dag->NumNodesReady()
		|| prevScriptRunNodes != dagman.dag->ScriptRunNodeCount()
		|| prevJobsHeld != dagman.dag->NumHeldJobProcs()
		|| DEBUG_LEVEL( DEBUG_DEBUG_4 ) ) {

		print_status();

		prevJobsDone = dagman.dag->NumNodesDone( true );
		prevJobs = dagman.dag->NumNodes( true );
		prevJobsFailed = dagman.dag->NumNodesFailed();
		prevJobsSubmitted = dagman.dag->NumJobsSubmitted();
		prevJobsReady = dagman.dag->NumNodesReady();
		prevScriptRunNodes = dagman.dag->ScriptRunNodeCount();
		prevJobsHeld = dagman.dag->NumHeldJobProcs();

		if( dagman.dag->GetDotFileUpdate() ) {
			dagman.dag->DumpDotFile();
		}
	}

	dagman.dag->DumpNodeStatus( false, false );

	ASSERT( dagman.dag->NumNodesDone( true ) + dagman.dag->NumNodesFailed()
			<= dagman.dag->NumNodes( true ) );

	//
	// If DAG is complete, hurray, and exit.
	//
	if( dagman.dag->DoneSuccess( true ) ) {
		ASSERT( dagman.dag->NumJobsSubmitted() == 0 );
		dagman.dag->CheckAllJobs();
		debug_printf( DEBUG_NORMAL, "All jobs Completed!\n" );
		dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
		if ( dagman.dag->NumIdleJobProcs() != 0 ) {
			debug_printf( DEBUG_NORMAL, "Warning: DAGMan thinks there "
						  "are %d idle jobs, even though the DAG is "
						  "completed!\n", dagman.dag->NumIdleJobProcs() );
			check_warning_strictness( DAG_STRICT_1 );
		}
		ExitSuccess();
		return;
	}

	//
	// DAG has failed -- dump rescue DAG.
	//
	if( dagman.dag->DoneFailed( true ) ) {
		main_shutdown_rescue( EXIT_ERROR, dagman.dag->_dagStatus );
		return;
	}

	//
	// DAG has succeeded but we haven't run final node yet, so do that.
	//
	if( dagman.dag->DoneSuccess( false ) ) {
		dagman.dag->StartFinalNode();
		return;
	}

	// If the DAG is halted, we don't want to actually exit yet if
	// jobs are still in the queue, or any POST scripts need to be
	// run (we need to run POST scripts so we don't "waste" jobs
	// that completed; on the other hand, we don't care about waiting
	// for PRE scripts because they'll be re-run when the rescue
	// DAG is run anyhow).
	if ( dagman.dag->IsHalted() && dagman.dag->NumJobsSubmitted() == 0
		 && dagman.dag->PostRunNodeCount() == 0
		 && !dagman.dag->RunningFinalNode() ) {
		debug_printf( DEBUG_QUIET, "Exiting because DAG is halted "
					  "and no jobs or scripts are running\n" );
		main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_HALTED );
		return;
	}

	//
	// If no jobs are submitted and no scripts are running, but the
	// DAG is not complete, then at least one job failed, or a cycle
	// exists.  (Note that if the DAG completed successfully, we already
	// returned from this function above.)
	//
	if( dagman.dag->FinishedRunning( false ) ) {
		Dag::dag_status dagStatus = Dag::DAG_STATUS_OK;
		if( dagman.dag->DoneFailed( false ) ) {
			if( DEBUG_LEVEL( DEBUG_QUIET ) ) {
				debug_printf( DEBUG_QUIET,
							  "ERROR: the following job(s) failed:\n" );
				dagman.dag->PrintJobList( Job::STATUS_ERROR );
			}
			dagStatus = Dag::DAG_STATUS_NODE_FAILED;
		} else {
			// No jobs failed, so a cycle must exist.
			debug_printf( DEBUG_QUIET, "ERROR: DAG finished but not all "
						  "nodes are complete -- checking for a cycle...\n" );
			if( dagman.dag->isCycle() ) {
				debug_printf( DEBUG_QUIET, "... ERROR: a cycle exists "
							  "in the dag, please check input\n" );
				dagStatus = Dag::DAG_STATUS_CYCLE;
			} else {
				debug_printf( DEBUG_QUIET, "... ERROR: no cycle found; "
							  "unknown error condition\n" );
				dagStatus = Dag::DAG_STATUS_ERROR;
			}
			if ( debug_level >= DEBUG_NORMAL ) {
				dagman.dag->PrintJobList();
			}
		}

		main_shutdown_rescue( EXIT_ERROR, dagStatus );
		return;
	}
}
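// A hedged sketch of how condor_event_timer() is presumably registered
// with DaemonCore during startup (the registration itself is outside
// this excerpt, and the 5-second period is illustrative, not taken
// from this file):
//
//     daemonCore->Register_Timer( 1, 5, condor_event_timer,
//                                 "condor_event_timer" );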