Exemplo n.º 1
0
// this gets called by DC when DAGMan receives a SIGUSR1 -- which,
// assuming the DAGMan submit file was properly written, is the signal
// the schedd will send if the DAGMan job is removed from the queue
int main_shutdown_remove(Service *, int) {
    debug_printf( DEBUG_QUIET, "Received SIGUSR1\n" );
	main_shutdown_rescue( EXIT_ABORT, Dag::DAG_STATUS_RM );
	return FALSE;
}
Exemplo n.º 2
0
static bool
submit_try( ArgList &args, CondorID &condorID, bool prohibitMultiJobs )
{
  MyString cmd; // for debug output
  args.GetArgsStringForDisplay( &cmd );

  FILE * fp = my_popen( args, "r", TRUE );
  if (fp == NULL) {
    debug_printf( DEBUG_NORMAL, 
		  "ERROR: my_popen(%s) in submit_try() failed!\n",
		  cmd.Value() );
    return false;
  }
  
  //----------------------------------------------------------------------
  // Parse submit command output for a HTCondor job ID.  This
  // desperately needs to be replaced by HTCondor submit APIs.
  //
  // Typical condor_submit output for HTCondor v6 looks like:
  //
  //   Submitting job(s).
  //   Logging submit event(s).
  //   1 job(s) submitted to cluster 2267.
  //----------------------------------------------------------------------

  char buffer[UTIL_MAX_LINE_LENGTH];
  buffer[0] = '\0';

  	// Configure what we look for in the command output according to
	// which type of job we have.
  const char *marker = NULL;
  parse_submit_fnc parseFnc = NULL;

  marker = " submitted to cluster ";

  // Note: we *could* check how many jobs got submitted here, and
  // correlate that with how many submit events we see later on.
  // I'm not worrying about that for now...  wenger 2006-02-07.
  // We also have to check the number of jobs to get an accurate
  // count of submitted jobs to report in the dagman.out file.

  // We should also check whether we got more than one cluster, and
  // either deal with it correctly or generate an error message.
  parseFnc = parse_condor_submit;
  
  // Take all of the output (both stdout and stderr) from condor_submit,
  // and echo it to the dagman.out file.  Look for
  // the line (if any) containing the word "cluster" (HTCondor).
  // If we don't find such a line, something
  // went wrong with the submit, so we return false.  The caller of this
  // function can retry the submit by repeatedly calling this function.

  MyString  command_output("");
  MyString keyLine("");
  while (fgets(buffer, UTIL_MAX_LINE_LENGTH, fp)) {
    MyString buf_line = buffer;
	buf_line.chomp();
	debug_printf(DEBUG_VERBOSE, "From submit: %s\n", buf_line.Value());
	command_output += buf_line;
    if (strstr(buffer, marker) != NULL) {
	  keyLine = buf_line;
	}
  }

  { // Relocated this curly bracket to its previous position to hopefully
    // fix Coverity warning.  Not sure why these curly brackets are here
	// at all...  wenger 2013-06-12
    int status = my_pclose(fp) & 0xff;

    if (keyLine == "") {
      debug_printf(DEBUG_NORMAL, "failed while reading from pipe.\n");
      debug_printf(DEBUG_NORMAL, "Read so far: %s\n", command_output.Value());
      return false;
    }

    if (status != 0) {
		debug_printf(DEBUG_NORMAL, "Read from pipe: %s\n", 
					 command_output.Value());
		debug_printf( DEBUG_QUIET, "ERROR while running \"%s\": "
					  "my_pclose() failed with status %d (errno %d, %s)!\n",
					  cmd.Value(), status, errno, strerror( errno ) );
		return false;
    }
  }

  int	jobProcCount;
  if ( !parseFnc( keyLine.Value(), jobProcCount, condorID._cluster) ) {
		// We are going forward (do not return false here)
		// Expectation is that higher levels will catch that we
		// did not get a cluster initialized properly here, fail,
		// and write a rescue DAG. gt3658 2013-06-03
		//
		// This is better than the old failure that would submit
		// DAGMAN_MAX_SUBMIT_ATTEMPT copies of the same job.
      debug_printf( DEBUG_NORMAL, "WARNING: submit returned 0, but "
        "parsing submit output failed!\n" );
		// Returning here so we don't try to process invalid values
		// below.  (This should really return something like "submit failed
		// don't retry" -- see gittrac #3685.)
  	  return true;
  }

  	// Check for multiple job procs if configured to disallow that.
  if ( prohibitMultiJobs && (jobProcCount > 1) ) {
	debug_printf( DEBUG_NORMAL, "Submit generated %d job procs; "
				"disallowed by DAGMAN_PROHIBIT_MULTI_JOBS setting\n",
				jobProcCount );
	main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
  }
  
  return true;
}
Exemplo n.º 3
0
void condor_event_timer () {

	ASSERT( dagman.dag != NULL );

    //------------------------------------------------------------------------
    // Proceed with normal operation
    //
    // At this point, the DAG is bootstrapped.  All jobs premarked DONE
    // are in a STATUS_DONE state, and all their children have been
    // marked ready to submit.
    //
    // If recovery was needed, the log file has been completely read and
    // we are ready to proceed with jobs yet unsubmitted.
    //------------------------------------------------------------------------

	if( dagman.paused == true ) {
		debug_printf( DEBUG_DEBUG_1, "(DAGMan paused)\n" );
		return;
	}

    static int prevJobsDone = 0;
    static int prevJobs = 0;
    static int prevJobsFailed = 0;
    static int prevJobsSubmitted = 0;
    static int prevJobsReady = 0;
    static int prevScriptRunNodes = 0;
    static int prevJobsHeld = 0;

	int justSubmitted;
	justSubmitted = dagman.dag->SubmitReadyJobs(dagman);
	if( justSubmitted ) {
			// Note: it would be nice to also have the proc submit
			// count here.  wenger, 2006-02-08.
		debug_printf( DEBUG_VERBOSE, "Just submitted %d job%s this cycle...\n",
				  	justSubmitted, justSubmitted == 1 ? "" : "s" );
	}

	// If the log has grown
	if( dagman.dag->DetectCondorLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( CONDORLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						"ProcessLogEvents(CONDORLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

	if( dagman.dag->DetectDaPLogGrowth() ) {
		if( dagman.dag->ProcessLogEvents( DAPLOG ) == false ) {
			debug_printf( DEBUG_NORMAL,
						"ProcessLogEvents(DAPLOG) returned false\n" );
			dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
			main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_ERROR );
			return;
		}
	}

    // print status if anything's changed (or we're in a high debug level)
    if( prevJobsDone != dagman.dag->NumNodesDone( true )
        || prevJobs != dagman.dag->NumNodes( true )
        || prevJobsFailed != dagman.dag->NumNodesFailed()
        || prevJobsSubmitted != dagman.dag->NumJobsSubmitted()
        || prevJobsReady != dagman.dag->NumNodesReady()
        || prevScriptRunNodes != dagman.dag->ScriptRunNodeCount()
		|| prevJobsHeld != dagman.dag->NumHeldJobProcs()
		|| DEBUG_LEVEL( DEBUG_DEBUG_4 ) ) {
		print_status();

        prevJobsDone = dagman.dag->NumNodesDone( true );
        prevJobs = dagman.dag->NumNodes( true );
        prevJobsFailed = dagman.dag->NumNodesFailed();
        prevJobsSubmitted = dagman.dag->NumJobsSubmitted();
        prevJobsReady = dagman.dag->NumNodesReady();
        prevScriptRunNodes = dagman.dag->ScriptRunNodeCount();
		prevJobsHeld = dagman.dag->NumHeldJobProcs();
		
		if( dagman.dag->GetDotFileUpdate() ) {
			dagman.dag->DumpDotFile();
		}
	}

	dagman.dag->DumpNodeStatus( false, false );

    ASSERT( dagman.dag->NumNodesDone( true ) + dagman.dag->NumNodesFailed()
			<= dagman.dag->NumNodes( true ) );

    //
    // If DAG is complete, hurray, and exit.
    //
    if( dagman.dag->DoneSuccess( true ) ) {
        ASSERT( dagman.dag->NumJobsSubmitted() == 0 );
		dagman.dag->CheckAllJobs();
        debug_printf( DEBUG_NORMAL, "All jobs Completed!\n" );
		dagman.dag->PrintDeferrals( DEBUG_NORMAL, true );
		if ( dagman.dag->NumIdleJobProcs() != 0 ) {
			debug_printf( DEBUG_NORMAL, "Warning:  DAGMan thinks there "
						"are %d idle jobs, even though the DAG is "
						"completed!\n", dagman.dag->NumIdleJobProcs() );
			check_warning_strictness( DAG_STRICT_1 );
		}
		ExitSuccess();
		return;
    }

	//
	// DAG has failed -- dump rescue DAG.
	//
    if( dagman.dag->DoneFailed( true ) ) {
		main_shutdown_rescue( EXIT_ERROR, dagman.dag->_dagStatus );
		return;
	}

	//
	// DAG has succeeded but we haven't run final node yet, so do that.
	//
    if( dagman.dag->DoneSuccess( false ) ) {
		dagman.dag->StartFinalNode();
		return;
	}

		// If the DAG is halted, we don't want to actually exit yet if
		// jobs are still in the queue, or any POST scripts need to be
		// run (we need to run POST scripts so we don't "waste" jobs
		// that completed; on the other hand, we don't care about waiting
		// for PRE scripts because they'll be re-run when the rescue
		// DAG is run anyhow).
	if ( dagman.dag->IsHalted() && dagman.dag->NumJobsSubmitted() == 0 &&
				dagman.dag->PostRunNodeCount() == 0 &&
				!dagman.dag->RunningFinalNode() ) {
		debug_printf ( DEBUG_QUIET, "Exiting because DAG is halted "
					"and no jobs or scripts are running\n" );
		main_shutdown_rescue( EXIT_ERROR, Dag::DAG_STATUS_HALTED );
		return;
	}

    //
    // If no jobs are submitted and no scripts are running, but the
    // dag is not complete, then at least one job failed, or a cycle
    // exists.  (Note that if the DAG completed successfully, we already
	// returned from this function above.)
    // 
    if( dagman.dag->FinishedRunning( false ) ) {
		Dag::dag_status dagStatus = Dag::DAG_STATUS_OK;
		if( dagman.dag->DoneFailed( false ) ) {
			if( DEBUG_LEVEL( DEBUG_QUIET ) ) {
				debug_printf( DEBUG_QUIET,
							  "ERROR: the following job(s) failed:\n" );
				dagman.dag->PrintJobList( Job::STATUS_ERROR );
			}
			dagStatus = Dag::DAG_STATUS_NODE_FAILED;
		} else {
			// no jobs failed, so a cycle must exist
			debug_printf( DEBUG_QUIET, "ERROR: DAG finished but not all "
						"nodes are complete -- checking for a cycle...\n" );
			if( dagman.dag->isCycle() ) {
				debug_printf (DEBUG_QUIET, "... ERROR: a cycle exists "
							"in the dag, please check input\n");
				dagStatus = Dag::DAG_STATUS_CYCLE;
			} else {
				debug_printf (DEBUG_QUIET, "... ERROR: no cycle found; "
							"unknown error condition\n");
				dagStatus = Dag::DAG_STATUS_ERROR;
			}
			if ( debug_level >= DEBUG_NORMAL ) {
				dagman.dag->PrintJobList();
			}
		}

		main_shutdown_rescue( EXIT_ERROR, dagStatus );
		return;
    }
}