void main_pre_dc_init( int, char*[] ) { DC_Skip_Auth_Init(); DC_Skip_Core_Init(); #ifdef WIN32 _setmaxstdio(2048); #endif // Convert the DAGMan log file name to an absolute path if it's // not one already, so that we'll log things to the right file // if we change to a different directory. const char * logFile = GetEnv( "_CONDOR_DAGMAN_LOG" ); if ( logFile && !fullpath( logFile ) ) { MyString currentDir; if ( condor_getcwd( currentDir ) ) { MyString newLogFile(currentDir); newLogFile += DIR_DELIM_STRING; newLogFile += logFile; SetEnv( "_CONDOR_DAGMAN_LOG", newLogFile.Value() ); } else { debug_printf( DEBUG_NORMAL, "ERROR: unable to get cwd: %d, %s\n", errno, strerror(errno) ); } } }
//------------------------------------------------------------------------- bool MakePathAbsolute(MyString &filePath, MyString &errMsg) { bool result = true; if ( !fullpath( filePath.Value() ) ) { MyString currentDir; if ( ! condor_getcwd( currentDir ) ) { formatstr( errMsg, "condor_getcwd() failed with errno %d (%s) at %s:%d", errno, strerror(errno), __FILE__, __LINE__ ); result = false; } filePath = currentDir + DIR_DELIM_STRING + filePath; } return result; }
bool
MultiLogFiles::makePathAbsolute(MyString &filename, CondorError &errstack)
{
		// Already absolute?  Then there's nothing to do.
	if ( fullpath( filename.Value() ) ) {
		return true;
	}

		// I'd like to use realpath() here, but I'm not sure
		// if that's portable across all platforms.  wenger 2009-01-09.
	MyString cwd;
	if ( !condor_getcwd( cwd ) ) {
		errstack.pushf( "MultiLogFiles", UTIL_ERR_GET_CWD,
				"ERROR: condor_getcwd() failed with errno %d (%s) at %s:%d",
				errno, strerror(errno), __FILE__, __LINE__);
		return false;
	}

		// Prepend the current working directory to make the path absolute.
	filename = cwd + DIR_DELIM_STRING + filename;
	return true;
}
void XferSummary::init() { start_time = time(0); num_sends = 0; bytes_sent = 0; tot_send_bandwidth = 0; time_sending = 0; num_recvs = 0; bytes_recv = 0; tot_recv_bandwidth = 0; time_recving = 0; if( ! Collectors ) { Collectors = CollectorList::create(); } if( ! condor_getcwd( pwd ) ) { EXCEPT( "Can't get working directory." ); } }
// Write the appropriate user-log event for a job that has terminated or
// been evicted, based on the signal recorded in the global JobStatus.
// Byte counts are reported from the *job's* perspective, so shadow "sent"
// bytes are logged as job "received" bytes (and vice versa).
// @param localr: rusage accumulated locally for this run
// @param remoter: rusage accumulated remotely for this run
extern "C" void
log_termination (struct rusage *localr, struct rusage *remoter)
{
	check_execute_event();

	switch (WTERMSIG(JobStatus)) {
	  case 0:
	  case -1:
		// if core, bad exectuable --- otherwise, a normal exit
		// NOTE(review): WEXITSTATUS==ENOEXEC with a core flag is treated as
		// "not executable"; WEXITSTATUS==0 with a core flag as a bad link.
		if (WCOREDUMP(JobStatus) && WEXITSTATUS(JobStatus) == ENOEXEC) {
			// log the ULOG_EXECUTABLE_ERROR event
			ExecutableErrorEvent event;
			event.errType = CONDOR_EVENT_NOT_EXECUTABLE;
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS, "Unable to log NOT_EXECUTABLE event\n");
			}
		} else if (WCOREDUMP(JobStatus) && WEXITSTATUS(JobStatus) == 0) {
			// log the ULOG_EXECUTABLE_ERROR event
			ExecutableErrorEvent event;
			event.errType = CONDOR_EVENT_BAD_LINK;
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS, "Unable to log BAD_LINK event\n");
			}
		} else {
			// log the ULOG_JOB_TERMINATED event
			JobTerminatedEvent event;
			event.normal = true; // normal termination
			event.returnValue = WEXITSTATUS(JobStatus);
			event.total_local_rusage = Proc->local_usage;
			event.total_remote_rusage = Proc->remote_usage[0];
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			event.total_recvd_bytes = TotalBytesSent + event.recvd_bytes;
			event.total_sent_bytes = TotalBytesRecvd + event.sent_bytes;
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS,"Unable to log ULOG_JOB_TERMINATED event\n");
			}
		}
		break;

	  case SIGKILL:
		// evicted without a checkpoint
		{
			JobEvictedEvent event;
			event.checkpointed = false;
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS, "Unable to log ULOG_JOB_EVICTED event\n");
			}
		}
		break;

	  case SIGQUIT:
		// evicted, but *with* a checkpoint
		{
			JobEvictedEvent event;
			event.checkpointed = true;
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS, "Unable to log ULOG_JOB_EVICTED event\n");
			}
		}
		break;

	  default:
		// abnormal termination (killed by some other signal)
		{
			MyString coredir;
			MyString coreFile;
			JobTerminatedEvent event;
			event.normal = false;
			event.signalNumber = WTERMSIG(JobStatus);
			if (WCOREDUMP(JobStatus)) {
				/* look up the corefile name in the job ad if one exists... */
				if (!JobAd->LookupString(ATTR_JOB_CORE_FILENAME, coreFile)) {
					/* if it didn't exist in the job ad, then construct what it
						should be. */
					ASSERT( condor_getcwd(coredir) );
					// Only prepend rootdir if the job ran chroot'ed somewhere
					// other than "/".
					if (strcmp (Proc->rootdir, "/") == 0) {
						coreFile.formatstr( "%s/core.%d.%d", coredir.Value(),
								Proc->id.cluster, Proc->id.proc );
					} else {
						coreFile.formatstr( "%s%s/core.%d.%d", Proc->rootdir,
								coredir.Value(), Proc->id.cluster,
								Proc->id.proc );
					}
				}
				event.setCoreFile( coreFile.Value() );
			}
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			event.total_local_rusage = Proc->local_usage;
			event.total_remote_rusage = Proc->remote_usage[0];
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event)) {
				dprintf (D_ALWAYS,"Unable to log ULOG_JOB_TERMINATED event\n");
			}
		}
	}
}
// Interpret the job's exit status, fill in the human-readable notification
// string, update the in-memory proc structure, and record the termination
// state in the job ClassAd (ATTR_TERMINATION_* / ATTR_ON_EXIT_* attributes).
// @param proc: the job's proc structure (status/completion_date updated)
// @param notification: buffer filled with a message for the user
//        (NOTE(review): written with sprintf -- assumes the caller's buffer
//        is large enough; verify against callers)
// @param jobstatus: the wait()-style status to interpret
// @param coredir: directory for core files; if NULL, the cwd is used
void
handle_termination( PROC *proc, char *notification, int *jobstatus,
			char const *coredir )
{
	MyString escapedbuf;
	int status = *jobstatus;
	MyString coredir_buf;
	dprintf(D_FULLDEBUG, "handle_termination() called.\n");
	ASSERT (JobAd != NULL );
	switch( WTERMSIG(status) ) {
	 case -1:
		/* On Digital Unix, WTERMSIG returns -1 if we weren't
			killed by a sig.  This is the same case as sig 0 */
	 case 0:
		/* If core, bad executable -- otherwise a normal exit */
		if( WCOREDUMP(status) && WEXITSTATUS(status) == ENOEXEC ) {
			(void)sprintf( notification, "is not executable." );
			dprintf( D_ALWAYS, "Shadow: Job file not executable\n" );
			ExitReason = JOB_KILLED;
		} else if( WCOREDUMP(status) && WEXITSTATUS(status) == 0 ) {
			(void)sprintf(notification,
				"was killed because it was not properly linked for execution \nwith Version 6 Condor.\n" );
			MainSymbolExists = FALSE;
			ExitReason = JOB_KILLED;
		} else {
			// Normal exit: record the exit code.
			(void)sprintf(notification, "exited with status %d.",
					WEXITSTATUS(status) );
			dprintf(D_ALWAYS, "Shadow: Job exited normally with status %d\n",
					WEXITSTATUS(status) );
			ExitReason = JOB_EXITED;
			JobExitStatus = WEXITSTATUS(status);
		}

		proc->status = COMPLETED;
		proc->completion_date = time( (time_t *)0 );

		JobAd->Assign(ATTR_ON_EXIT_BY_SIGNAL,false);
		JobAd->Assign( ATTR_ON_EXIT_CODE, WEXITSTATUS(status));

		// set up the terminate pending "state"
		JobAd->Assign(ATTR_TERMINATION_PENDING,true);

		// this can have newlines and crap in it, for now, just replace them
		// with spaces. I know it is ugly, but the classadlog class which
		// writes the job queue log file can't have literal newlines and such
		// in the values. :(
		escapedbuf = notification;
		escapedbuf.replaceString("\n", " ");
		escapedbuf.replaceString("\t", " ");
		JobAd->Assign(ATTR_TERMINATION_REASON,escapedbuf.Value());

		JobAd->Assign(ATTR_TERMINATION_EXITREASON, ExitReason);
		break;

	 case SIGKILL:
		/* Kicked off without a checkpoint */
		dprintf(D_ALWAYS, "Shadow: Job was kicked off without a checkpoint\n" );
		DoCleanup();
		ExitReason = JOB_NOT_CKPTED;
#if 0 /* This is a problem for the new feature where we choose our executable
		dynamically, so don't do it. */
		if( stat(ICkptName,&st_buf) < 0) {
			dprintf(D_ALWAYS,"No initial ckpt found\n");
			ExitReason = JOB_NO_CKPT_FILE;
		}
#endif
		/* in here, we disregard what the user wanted. Otherwise doing a
			condor_rm will result in the wanting to be resubmitted or held
			by the shadow. */
		JobAd->Assign(ATTR_ON_EXIT_HOLD_CHECK,false);
		JobAd->Assign(ATTR_ON_EXIT_REMOVE_CHECK,true);

		// set up the terminate pending "state"
		JobAd->Assign(ATTR_TERMINATION_PENDING,true);
		JobAd->Assign(ATTR_TERMINATION_EXITREASON,ExitReason);
		break;

	 case SIGQUIT:
		/* Kicked off, but with a checkpoint */
		dprintf(D_ALWAYS, "Shadow: Job was checkpointed\n" );
		proc->status = IDLE;
		ExitReason = JOB_CKPTED;
		/* in here, we disregard what the user wanted. Otherwise doing a
			condor_vacate will result in the wanting to be resubmitted or
			held by the shadow. */
		JobAd->Assign(ATTR_ON_EXIT_HOLD_CHECK,false);

		// this can have newlines and crap in it, for now, just replace them
		// with spaces. I know it is ugly, but the classadlog class which
		// writes the job queue log file can't have literal newlines and such
		// in the values. :(
		escapedbuf = notification;
		escapedbuf.replaceString("\n", " ");
		escapedbuf.replaceString("\t", " ");
		JobAd->Assign(ATTR_TERMINATION_REASON, escapedbuf.Value());

		JobAd->Assign(ATTR_ON_EXIT_REMOVE_CHECK,true);

		/* add in the signature of the checkpointing host for this
			completed ckpt */
		if (LastCkptPlatform != NULL) {
			JobAd->Assign(ATTR_LAST_CHECKPOINT_PLATFORM,LastCkptPlatform);
		}
		break;

	 default:
		/* Job exited abnormally (killed by a signal) */
		if (coredir == NULL) {
			// No core directory supplied -- fall back to the cwd.
			ASSERT( condor_getcwd(coredir_buf) );
			coredir = coredir_buf.Value();
		}
		if( WCOREDUMP(status) ) {
			MyString corepath;
			// Only prepend rootdir if the job ran chroot'ed somewhere
			// other than "/".
			if( strcmp(proc->rootdir, "/") == 0 ) {
				(void)sprintf(notification,
					"was killed by signal %d.\nCore file is %s/core.%d.%d.",
					WTERMSIG(status) ,coredir, proc->id.cluster,
					proc->id.proc);
				corepath.sprintf("%s/core.%d.%d", coredir,
					proc->id.cluster, proc->id.proc);
			} else {
				(void)sprintf(notification,
					"was killed by signal %d.\nCore file is %s%s/core.%d.%d.",
					WTERMSIG(status) ,proc->rootdir, coredir,
					proc->id.cluster, proc->id.proc);
				corepath.sprintf("%s%s/core.%d.%d", proc->rootdir,
					coredir, proc->id.cluster, proc->id.proc);
			}
			JobAd->Assign(ATTR_JOB_CORE_FILENAME,corepath.Value());
			ExitReason = JOB_COREDUMPED;
		} else {
			(void)sprintf(notification, "was killed by signal %d.",
				WTERMSIG(status));
			ExitReason = JOB_KILLED;
		}
		dprintf(D_ALWAYS, "Shadow: %s\n", notification);

		proc->status = COMPLETED;
		proc->completion_date = time( (time_t *)0 );

		JobAd->Assign(ATTR_ON_EXIT_BY_SIGNAL,true);
		JobAd->Assign(ATTR_ON_EXIT_SIGNAL,WTERMSIG(status));

		// set up the terminate pending "state"
		JobAd->Assign(ATTR_TERMINATION_PENDING,true);

		// this can have newlines and crap in it, for now, just replace them
		// with spaces. I know it is ugly, but the classadlog class which
		// writes the job queue log file can't have literal newlines and such
		// in the values. :(
		escapedbuf = notification;
		escapedbuf.replaceString("\n", " ");
		escapedbuf.replaceString("\t", " ");
		JobAd->Assign(ATTR_TERMINATION_REASON,escapedbuf.Value());

		JobAd->Assign(ATTR_TERMINATION_EXITREASON, ExitReason);
		break;
	}
}
/** Set up things in deep and shallow options that aren't directly specified on the command line. @param deepOpts: the condor_submit_dag deep options @param shallowOpts: the condor_submit_dag shallow options @return 0 if successful, 1 if failed */ int setUpOptions( SubmitDagDeepOptions &deepOpts, SubmitDagShallowOptions &shallowOpts ) { shallowOpts.strLibOut = shallowOpts.primaryDagFile + ".lib.out"; shallowOpts.strLibErr = shallowOpts.primaryDagFile + ".lib.err"; if ( deepOpts.strOutfileDir != "" ) { shallowOpts.strDebugLog = deepOpts.strOutfileDir + DIR_DELIM_STRING + condor_basename( shallowOpts.primaryDagFile.Value() ); } else { shallowOpts.strDebugLog = shallowOpts.primaryDagFile; } shallowOpts.strDebugLog += ".dagman.out"; shallowOpts.strSchedLog = shallowOpts.primaryDagFile + ".dagman.log"; shallowOpts.strSubFile = shallowOpts.primaryDagFile + DAG_SUBMIT_FILE_SUFFIX; MyString rescueDagBase; // If we're running each DAG in its own directory, write any rescue // DAG to the current directory, to avoid confusion (since the // rescue DAG must be run from the current directory). if ( deepOpts.useDagDir ) { if ( !condor_getcwd( rescueDagBase ) ) { fprintf( stderr, "ERROR: unable to get cwd: %d, %s\n", errno, strerror(errno) ); return 1; } rescueDagBase += DIR_DELIM_STRING; rescueDagBase += condor_basename(shallowOpts.primaryDagFile.Value()); } else { rescueDagBase = shallowOpts.primaryDagFile; } // If we're running multiple DAGs, put "_multi" in the rescue // DAG name to indicate that the rescue DAG is for *all* of // the DAGs we're running. 
if ( shallowOpts.dagFiles.number() > 1 ) { rescueDagBase += "_multi"; } shallowOpts.strRescueFile = rescueDagBase + ".rescue"; shallowOpts.strLockFile = shallowOpts.primaryDagFile + ".lock"; if (deepOpts.strDagmanPath == "" ) { deepOpts.strDagmanPath = which( dagman_exe ); } if (deepOpts.strDagmanPath == "") { fprintf( stderr, "ERROR: can't find %s in PATH, aborting.\n", dagman_exe ); return 1; } MyString msg; if ( !GetConfigFile( shallowOpts.dagFiles, deepOpts.useDagDir, shallowOpts.strConfigFile, msg) ) { fprintf( stderr, "ERROR: %s\n", msg.Value() ); return 1; } return 0; }
// Pre-DaemonCore initialization for the starter: decide whether we are
// running as the gridshell (from argv[0] or a -gridshell flag), handle the
// "-classad" print-and-exit mode, stash the original cwd, and dup() the
// standard file descriptors so they can be handed to the user job.
void main_pre_dc_init( int argc, char* argv[] )
{
	param_functions *p_funcs = NULL;
		// figure out what get_mySubSystem() should be based on argv[0], or
		// if we see "-gridshell" anywhere on the command-line
	const char* base = condor_basename(argv[0]);
	char const *tmp;
	tmp = strrchr(base, '_' );
	if( tmp && strncasecmp(tmp, "_gridshell", 10) == MATCH ) {
		// Binary name ends in "_gridshell" -- run in gridshell mode.
		get_mySubSystem()->setName( "GRIDSHELL" );
		is_gridshell = true;
	} else {
		// Otherwise, scan the command line for a -gridshell flag
		// (abbreviations of at least 3 chars accepted).
		int i, len;
		for( i=1; i<argc; i++ ) {
			len = strlen(argv[i]);
			if( len < 3 ) {
					// ambiguous, bail out, since we don't want to get
					// confused with just "-" or something
				continue;
			}
			if( strncasecmp(argv[i], "-gridshell", MIN(len,10)) == MATCH ) {
				get_mySubSystem()->setName( "GRIDSHELL" );
				is_gridshell = true;
				break;
			}
		}
	}
	if( ! is_gridshell ) {
		get_mySubSystem()->setName( "STARTER" );
	}

		// if we were passed "-classad", just print our classad and
		// exit, without going back to daemoncore or anything.  we
		// need to do this *after* we set get_mySubSystem(), since this ends
		// up calling functions that rely on it being defined...
	if( argc == 2 && strncasecmp(argv[1],"-cla",4) == MATCH ) {
			// needed for Java stuff
		config(true);

			// Would like to log errors to stderr if stderr is not
			// /dev/null to make it easier for users to debug, but not
			// sure how to do that on windows.  On Windows, when
			// condor_starter is run by the startd, setting Termlog=1
			// causes a dprintf to abort with an error if any calls to
			// dprintf() are made in a debug level that is turned on.
			// I have not found a way to detect when stderr is in this
			// state, so I am just leaving Termlog turned off in all
			// cases.
			//Termlog = 1;

		p_funcs = get_param_functions();
		dprintf_config(get_mySubSystem()->getName(), p_funcs);

		printClassAd();
		exit(0);
	}

		// if we're still here, stash the cwd for future reference
	MyString cwd;
	if( ! condor_getcwd(cwd)) {
		dprintf( D_ALWAYS, "ERROR calling getcwd(): %s (errno %d)\n",
				 strerror(errno), errno );
	} else {
		orig_cwd = strdup(cwd.Value());
	}

		// if we're the gridshell, assume a "-f" option.  all that
		// does in DaemonCore-land is set a global variable, so we'll
		// just do that here, ourselves...
	if( is_gridshell ) {
		Foreground = 1;
	}

		// finally, dup() our standard file streams, so we can pass
		// those onto the actual user job if requested.
	starter_stdin_fd = dup( 0 );
	starter_stdout_fd = dup( 1 );
	starter_stderr_fd = dup( 2 );
}
// Gets the log files from a Stork submit file. MyString MultiLogFiles::loadLogFileNamesFromStorkSubFile( const MyString &strSubFilename, const MyString &directory, StringList &listLogFilenames) { MyString rtnVal; MyString path; std::string adBuf; classad::ClassAdParser parser; classad::PrettyPrint unparser; std::string unparsed; dprintf( D_FULLDEBUG, "MultiLogFiles::loadLogFileNamesFromStorkSubFile(%s, %s)\n", strSubFilename.Value(), directory.Value() ); // Construct fully qualified path from directory and log file. if ( directory.Length() > 0 ) { path = directory + DIR_DELIM_STRING; } path += strSubFilename; // Read submit file into std::string buffer, the native input buffer for // the [new] ClassAds parser. rtnVal = MultiLogFiles::readFile( path.Value(), adBuf); if (rtnVal.Length() > 0 ) { return rtnVal; } // read all classads out of the input file int offset = 0; classad::ClassAd ad; // Loop through the Stork submit file, parsing out one submit job [ClassAd] // at a time. skip_whitespace(adBuf,offset); // until the parser can do this itself while (parser.ParseClassAd(adBuf, ad, offset) ) { std::string logfile; // ad now contains the next Stork job ClassAd. Extract log file, if // found. if ( ! ad.EvaluateAttrString("log", logfile) ) { // no log file specified continue; } // reject empty log file names if ( logfile.empty() ) { unparser.Unparse( unparsed, &ad); rtnVal.formatstr("Stork job specifies null log file:%s", unparsed.c_str() ); return rtnVal; } // reject log file names with embedded macros if ( logfile.find('$') != std::string::npos) { unparser.Unparse( unparsed, &ad); rtnVal.formatstr("macros not allowed in Stork log file names:%s", unparsed.c_str() ); return rtnVal; } // All logfile must be fully qualified paths. Prepend the current // working directory if logfile not a fully qualified path. if ( ! fullpath(logfile.c_str() ) ) { MyString currentDir; if ( ! 
condor_getcwd(currentDir) ) { rtnVal.formatstr("condor_getcwd() failed with errno %d (%s)", errno, strerror(errno)); dprintf(D_ALWAYS, "ERROR: %s at %s:%d\n", rtnVal.Value(), __FILE__, __LINE__); return rtnVal; } std::string tmp = currentDir.Value(); tmp += DIR_DELIM_STRING; tmp += logfile; logfile = tmp; } // Add the log file we just found to the log file list // (if it's not already in the list -- we don't want // duplicates). listLogFilenames.rewind(); char *psLogFilename; bool bAlreadyInList = false; while ( (psLogFilename = listLogFilenames.next()) ) { if (logfile == psLogFilename) { bAlreadyInList = true; } } if (!bAlreadyInList) { // Note: append copies the string here. listLogFilenames.append(logfile.c_str() ); } skip_whitespace(adBuf,offset); // until the parser can do this itself } return rtnVal; }
//--------------------------------------------------------------------------- void main_init (int argc, char ** const argv) { printf ("Executing condor dagman ... \n"); // flag used if DAGMan is invoked with -WaitForDebug so we // wait for a developer to attach with a debugger... volatile int wait_for_debug = 0; // process any config vars -- this happens before we process // argv[], since arguments should override config settings dagman.Config(); // The DCpermission (last parm) should probably be PARENT, if it existed daemonCore->Register_Signal( SIGUSR1, "SIGUSR1", (SignalHandler) main_shutdown_remove, "main_shutdown_remove", NULL); /****** FOR TESTING ******* daemonCore->Register_Signal( SIGUSR2, "SIGUSR2", (SignalHandler) main_testing_stub, "main_testing_stub", NULL); ****** FOR TESTING ********/ debug_progname = condor_basename(argv[0]); // condor_submit_dag version from .condor.sub bool allowVerMismatch = false; const char *csdVersion = "undefined"; int i; for (i = 0 ; i < argc ; i++) { debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] ); } if (argc < 2) Usage(); // Make sure an input file was specified // get dagman job id from environment, if it's there // (otherwise it will be set to "-1.-1.-1") dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) ); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Minimum legal version for a .condor.sub file to be compatible // with this condor_dagman binary. // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // Be sure to change this if the arguments or environment // passed to condor_dagman change in an incompatible way!! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! struct DagVersionData { int majorVer; int minorVer; int subMinorVer; }; const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 }; // Construct a string of the minimum submit file version. 
MyString minSubmitVersionStr; minSubmitVersionStr.formatstr( "%d.%d.%d", MIN_SUBMIT_FILE_VERSION.majorVer, MIN_SUBMIT_FILE_VERSION.minorVer, MIN_SUBMIT_FILE_VERSION.subMinorVer ); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // Process command-line arguments // for (i = 1; i < argc; i++) { if( !strcasecmp( "-Debug", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No debug level specified\n" ); Usage(); } debug_level = (debug_level_t) atoi (argv[i]); } else if( !strcasecmp( "-Lockfile", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" ); Usage(); } lockFileName = argv[i]; } else if( !strcasecmp( "-Help", argv[i] ) ) { Usage(); } else if (!strcasecmp( "-Dag", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No DAG specified\n" ); Usage(); } dagman.dagFiles.append( argv[i] ); } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxIdle\n" ); Usage(); } dagman.maxIdle = atoi( argv[i] ); } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxJobs\n" ); Usage(); } dagman.maxJobs = atoi( argv[i] ); } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) { debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with " "-MaxPre and -MaxPost arguments\n" ); Usage(); } else if( !strcasecmp( "-MaxPre", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxPre\n" ); Usage(); } dagman.maxPreScripts = atoi( argv[i] ); } else if( !strcasecmp( "-MaxPost", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxPost\n" ); Usage(); } dagman.maxPostScripts = 
atoi( argv[i] ); } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) { debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is " "ignored; please use the DAGMAN_ALLOW_EVENTS " "config parameter instead\n"); check_warning_strictness( DAG_STRICT_1 ); } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) { dagman.allowLogError = true; } else if( !strcasecmp( "-DontAlwaysRunPost",argv[i] ) ) { dagman._runPost = false; } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) { wait_for_debug = 1; } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) { dagman.useDagDir = true; } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" ); Usage(); } dagman.autoRescue = (atoi( argv[i] ) != 0); } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" ); Usage(); } dagman.doRescueFrom = atoi (argv[i]); } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" ); Usage(); } csdVersion = argv[i]; } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) { allowVerMismatch = true; } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) { dagman.dumpRescueDag = true; } else if( !strcasecmp( "-verbose", argv[i] ) ) { dagman._submitDagDeepOpts.bVerbose = true; } else if( !strcasecmp( "-force", argv[i] ) ) { dagman._submitDagDeepOpts.bForce = true; } else if( !strcasecmp( "-notification", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No notification value specified\n" ); Usage(); } dagman._submitDagDeepOpts.strNotification = argv[i]; } else if( !strcasecmp( "-dagman", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No dagman value specified\n" ); Usage(); } 
dagman._submitDagDeepOpts.strDagmanPath = argv[i]; } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" ); Usage(); } dagman._submitDagDeepOpts.strOutfileDir = argv[i]; } else if( !strcasecmp( "-update_submit", argv[i] ) ) { dagman._submitDagDeepOpts.updateSubmit = true; } else if( !strcasecmp( "-import_env", argv[i] ) ) { dagman._submitDagDeepOpts.importEnv = true; } else if( !strcasecmp( "-priority", argv[i] ) ) { ++i; if( i >= argc || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_NORMAL, "No priority value specified\n"); Usage(); } dagman._submitDagDeepOpts.priority = atoi(argv[i]); } else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) { dagman._submitDagDeepOpts.always_use_node_log = false; } else { debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n", argv[i] ); Usage(); } } dagman.dagFiles.rewind(); dagman.primaryDagFile = dagman.dagFiles.next(); dagman.multiDags = (dagman.dagFiles.number() > 1); MyString tmpDefaultLog; if ( dagman._defaultNodeLog != NULL ) { tmpDefaultLog = dagman._defaultNodeLog; free( dagman._defaultNodeLog ); } else { tmpDefaultLog = dagman.primaryDagFile + ".nodes.log"; } // Force default log file path to be absolute so it works // with -usedagdir and DIR nodes. 
CondorError errstack; if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) { debug_printf( DEBUG_QUIET, "Unable to convert default log " "file name to absolute path: %s\n", errstack.getFullText().c_str() ); dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR ); DC_Exit( EXIT_ERROR ); } dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() ); debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n", dagman._defaultNodeLog); // // Check the arguments // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Checking for version compatibility between the .condor.sub // file and this condor_dagman binary... // Note: if we're in recovery mode and the submit file version // causes us to quit, we leave any existing node jobs still // running -- may want to change that eventually. wenger 2009-10-13. // Version of the condor_submit_dag that created our submit file. CondorVersionInfo submitFileVersion( csdVersion ); // Version of this condor_dagman binary. CondorVersionInfo dagmanVersion; // Just generate this message fragment in one place. MyString versionMsg; versionMsg.formatstr("the version (%s) of this DAG's Condor submit " "file (created by condor_submit_dag)", csdVersion ); // Make sure version in submit file is valid. if( !submitFileVersion.is_valid() ) { if ( !allowVerMismatch ) { debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n", versionMsg.Value() ); DC_Exit( EXIT_ERROR ); } else { debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; " "continuing because of -AllowVersionMismatch flag\n", versionMsg.Value() ); } // Make sure .condor.sub file is recent enough. 
} else if ( submitFileVersion.compare_versions( CondorVersion() ) != 0 ) { if( !submitFileVersion.built_since_version( MIN_SUBMIT_FILE_VERSION.majorVer, MIN_SUBMIT_FILE_VERSION.minorVer, MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) { if ( !allowVerMismatch ) { debug_printf( DEBUG_QUIET, "Error: %s is older than " "oldest permissible version (%s)\n", versionMsg.Value(), minSubmitVersionStr.Value() ); DC_Exit( EXIT_ERROR ); } else { debug_printf( DEBUG_NORMAL, "Warning: %s is older than " "oldest permissible version (%s); continuing " "because of -AllowVersionMismatch flag\n", versionMsg.Value(), minSubmitVersionStr.Value() ); } // Warn if .condor.sub file is a newer version than this binary. } else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) { debug_printf( DEBUG_NORMAL, "Warning: %s is newer than " "condor_dagman version (%s)\n", versionMsg.Value(), CondorVersion() ); check_warning_strictness( DAG_STRICT_3 ); } else { debug_printf( DEBUG_NORMAL, "Note: %s differs from " "condor_dagman version (%s), but the " "difference is permissible\n", versionMsg.Value(), CondorVersion() ); } } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if( dagman.primaryDagFile == "" ) { debug_printf( DEBUG_SILENT, "No DAG file was specified\n" ); Usage(); } if (lockFileName == NULL) { debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" ); Usage(); } if( dagman.maxJobs < 0 ) { debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n"); Usage(); } if( dagman.maxPreScripts < 0 ) { debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" ); Usage(); } if( dagman.maxPostScripts < 0 ) { debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" ); Usage(); } if( dagman.doRescueFrom < 0 ) { debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" ); Usage(); } debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n", lockFileName ); if ( dagman.dagFiles.number() == 1 ) { debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n", 
dagman.primaryDagFile.Value() ); } else { MyString msg = "DAG Input files are "; dagman.dagFiles.rewind(); const char *dagFile; while ( (dagFile = dagman.dagFiles.next()) != NULL ) { msg += dagFile; msg += " "; } msg += "\n"; debug_printf( DEBUG_VERBOSE, "%s", msg.Value() ); } // if requested, wait for someone to attach with a debugger... while( wait_for_debug ) { } { MyString cwd; if( !condor_getcwd(cwd) ) { cwd = "<null>"; } debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value()); char *temp = my_username(); debug_printf( DEBUG_DEBUG_1, "Current user is %s\n", temp ? temp : "<null>" ); if( temp ) { free( temp ); } } // // Figure out the rescue DAG to run, if any (this is with "new- // style" rescue DAGs). // int rescueDagNum = 0; MyString rescueDagMsg; if ( dagman.doRescueFrom != 0 ) { rescueDagNum = dagman.doRescueFrom; rescueDagMsg.formatstr( "Rescue DAG number %d specified", rescueDagNum ); RenameRescueDagsAfter( dagman.primaryDagFile.Value(), dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum ); } else if ( dagman.autoRescue ) { rescueDagNum = FindLastRescueDagNum( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum ); rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum ); } // // Fill in values in the deep submit options that we haven't // already set. // dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError; dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir; dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue; dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom; dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch; dagman._submitDagDeepOpts.recurse = false; // // Create the DAG // // Note: a bunch of the parameters we pass here duplicate things // in submitDagOpts, but I'm keeping them separate so we don't have to // bother to construct a new SubmitDagOtions object for splices. 
// wenger 2010-03-25 dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs, dagman.maxPreScripts, dagman.maxPostScripts, dagman.allowLogError, dagman.useDagDir, dagman.maxIdle, dagman.retrySubmitFirst, dagman.retryNodeFirst, dagman.condorRmExe, dagman.storkRmExe, &dagman.DAGManJobId, dagman.prohibitMultiJobs, dagman.submitDepthFirst, dagman._defaultNodeLog, dagman._generateSubdagSubmits, &dagman._submitDagDeepOpts, false ); /* toplevel dag! */ if( dagman.dag == NULL ) { EXCEPT( "ERROR: out of memory!\n"); } dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit ); dagman.dag->SetAllowEvents( dagman.allow_events ); dagman.dag->SetConfigFile( dagman._dagmanConfigFile ); dagman.dag->SetMaxJobHolds( dagman._maxJobHolds ); dagman.dag->SetPostRun(dagman._runPost); if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line dagman.dag->SetDefaultPriority(dagman._submitDagDeepOpts.priority); } else if( dagman._defaultPriority != 0 ) { // From config file dagman.dag->SetDefaultPriority(dagman._defaultPriority); dagman._submitDagDeepOpts.priority = dagman._defaultPriority; } // // Parse the input files. The parse() routine // takes care of adding jobs and dependencies to the DagMan // dagman.mungeNodeNames = (dagman.dagFiles.number() > 1); parseSetDoNameMunge( dagman.mungeNodeNames ); debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n", dagman.dagFiles.number() ); dagman.dagFiles.rewind(); char *dagFile; // Here we make a copy of the dagFiles for iteration purposes. Deep inside // of the parsing, copies of the dagman.dagFile string list happen which // mess up the iteration of this list. StringList sl( dagman.dagFiles ); sl.rewind(); while ( (dagFile = sl.next()) != NULL ) { debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile ); if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) { if ( dagman.dumpRescueDag ) { // Dump the rescue DAG so we can see what we got // in the failed parse attempt. 
debug_printf( DEBUG_QUIET, "Dumping rescue DAG " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, false, true, false ); } dagman.dag->RemoveRunningJobs(dagman, true); MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); // Note: debug_error calls DC_Exit(). debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n", dagFile ); } } if( dagman.dag->GetDefaultPriority() != 0 ) { dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag } dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId ); if ( rescueDagNum > 0 ) { // Get our Pegasus sequence numbers set correctly. dagman.dag->GetJobstateLog().InitializeRescue(); } // lift the final set of splices into the main dag. dagman.dag->LiftSplices(SELF); // // Actually parse the "new-new" style (partial DAG info only) // rescue DAG here. Note: this *must* be done after splices // are lifted! // if ( rescueDagNum > 0 ) { dagman.rescueFileToRun = RescueDagName( dagman.primaryDagFile.Value(), dagman.multiDags, rescueDagNum ); debug_printf ( DEBUG_QUIET, "%s; running %s in combination with " "normal DAG file%s\n", rescueDagMsg.Value(), dagman.rescueFileToRun.Value(), dagman.multiDags ? "s" : ""); debug_printf ( DEBUG_QUIET, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); debug_printf ( DEBUG_QUIET, "USING RESCUE DAG %s\n", dagman.rescueFileToRun.Value() ); // Turn off node name munging for the rescue DAG, because // it will already have munged node names. parseSetDoNameMunge( false ); if( !parse( dagman.dag, dagman.rescueFileToRun.Value(), dagman.useDagDir ) ) { if ( dagman.dumpRescueDag ) { // Dump the rescue DAG so we can see what we got // in the failed parse attempt. 
debug_printf( DEBUG_QUIET, "Dumping rescue DAG " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, true, false ); } dagman.dag->RemoveRunningJobs(dagman, true); MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); // Note: debug_error calls DC_Exit(). debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n", dagFile ); } } dagman.dag->CheckThrottleCats(); // fix up any use of $(JOB) in the vars values for any node dagman.dag->ResolveVarsInterpolations(); /* debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/ /* dagman.dag->PrintJobList();*/ #ifndef NOT_DETECT_CYCLE if( dagman.startup_cycle_detect && dagman.dag->isCycle() ) { // Note: maybe we should run the final node here, if there is one. // wenger 2011-12-19. debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, please check input\n"); } #endif debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n", dagman.dag->NumNodes( true ) ); MyString firstLocation; if ( dagman.dag->GetReject( firstLocation ) ) { debug_printf( DEBUG_QUIET, "Exiting because of REJECT " "specification in %s. This most likely means " "that the DAG file was produced with the -DumpRescue " "flag when parsing the original DAG failed.\n", firstLocation.Value() ); DC_Exit( EXIT_ERROR ); return; } dagman.dag->DumpDotFile(); if ( dagman.dumpRescueDag ) { debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, false, false, false ); ExitSuccess(); return; } //------------------------------------------------------------------------ // Bootstrap and Recovery // // If the Lockfile exists, this indicates a premature termination // of a previous run of Dagman. 
If condor log is also present, // we run in recovery mode // If the Daglog is not present, then we do not run in recovery // mode { bool recovery = access(lockFileName, F_OK) == 0; if (recovery) { debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n", lockFileName); if (dagman.abortDuplicates) { if (util_check_lock_file(lockFileName) == 1) { debug_printf( DEBUG_QUIET, "Aborting because it " "looks like another instance of DAGMan is " "currently running on this DAG; if that is " "not the case, delete the lock file (%s) " "and re-submit the DAG.\n", lockFileName ); dagman.dag->GetJobstateLog(). WriteDagmanFinished( EXIT_RESTART ); dagman.CleanUp(); DC_Exit( EXIT_ERROR ); // We should never get to here! } } } // // If this DAGMan continues, it should overwrite the lock // file if it exists. // util_create_lock_file(lockFileName, dagman.abortDuplicates); debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n"); if( !dagman.dag->Bootstrap( recovery ) ) { dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 ); debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n"); } } debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" ); daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval, condor_event_timer, "condor_event_timer" ); dagman.dag->SetPendingNodeReportInterval( dagman.pendingReportInterval ); }