void CheckSpoolVersion( int spool_min_version_i_support, int spool_cur_version_i_support) { std::string spool; int spool_min_version; int spool_cur_version; ASSERT( param(spool,"SPOOL") ); CheckSpoolVersion(spool.c_str(),spool_min_version_i_support,spool_cur_version_i_support,spool_min_version,spool_cur_version); }
void main_init(int argc, char *argv[]) { _EXCEPT_Cleanup = ExceptCleanup; /* Start up with condor.condor privileges. */ set_condor_priv(); // Register a do-nothing reaper. This is just because the // file transfer object, which could be instantiated later, // registers its own reaper and does an EXCEPT if it gets // a reaper ID of 1 (since lots of other daemons have a reaper // ID of 1 hard-coded as special... this is bad). daemonCore->Register_Reaper("dummy_reaper", (ReaperHandler)&dummy_reaper, "dummy_reaper",NULL); // register SIGUSR1 (condor_rm) for shutdown... daemonCore->Register_Signal( SIGUSR1, "SIGUSR1", (SignalHandler)&handleSignals,"handleSignals"); // register UPDATE_JOBAD for qedit changes daemonCore->Register_Signal( UPDATE_JOBAD, "UPDATE_JOBAD", (SignalHandler)&handleSignals,"handleSignals"); // handle daemoncore signals which are passed down daemonCore->Register_Signal( DC_SIGSUSPEND, "DC_SIGSUSPEND", (SignalHandler)&handleSignals,"handleSignals"); daemonCore->Register_Signal( DC_SIGCONTINUE, "DC_SIGCONTINUE", (SignalHandler)&handleSignals,"handleSignals"); int shadow_worklife = param_integer( "SHADOW_WORKLIFE", 3600 ); if( shadow_worklife > 0 ) { shadow_worklife_expires = time(NULL) + shadow_worklife; } else if( shadow_worklife == 0 ) { // run one job and then exit shadow_worklife_expires = time(NULL)-1; } else { shadow_worklife_expires = 0; } parseArgs( argc, argv ); CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,SPOOL_CUR_VERSION_SHADOW_SUPPORTS); ClassAd* ad = readJobAd(); if( ! ad ) { EXCEPT( "Failed to read job ad!" ); } startShadow( ad ); }
/*ARGSUSED*/ int main(int argc, char *argv[] ) { char *tmp = NULL; int reserved_swap, free_swap; char *host = NULL, *cluster = NULL, *proc = NULL; char *bogus_capability; int i; set_mySubSystem( "SHADOW", SUBSYSTEM_TYPE_SHADOW ); myDistro->Init( argc, argv ); if( argc == 2 && strncasecmp(argv[1], "-cl", 3) == MATCH ) { printClassAd(); exit( 0 ); } #if defined(SYSCALL_DEBUG) SyscallLabel = argv[0] + 7; #endif #if !defined(WIN32) install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN ); #endif if( argc > 1 ) { if( strcmp("-t",argv[1]) == MATCH ) { Termlog = 1; argv++; argc--; } } ShadowBDate = LastRestartTime = time(0); _EXCEPT_Cleanup = ExceptCleanup; MyPid = getpid(); config(); /* Start up with condor.condor privileges. */ /* we need to do this AFTER we call config() so that if CONDOR_IDS is being defined in the config file, we'll get the right value */ set_condor_priv(); if(Termlog) dprintf_set_tool_debug(get_mySubSystem()->getName(), 0); else dprintf_config( get_mySubSystem()->getName() ); DebugId = whoami; // create a database connection object dprintf( D_ALWAYS, "******* Standard Shadow starting up *******\n" ); dprintf( D_ALWAYS, "** %s\n", CondorVersion() ); dprintf( D_ALWAYS, "** %s\n", CondorPlatform() ); dprintf( D_ALWAYS, "*******************************************\n" ); reserved_swap = param_integer("RESERVED_SWAP", 0); reserved_swap *= 1024; /* megabytes -> kb */ bool use_sql_log = param_boolean("QUILL_USE_SQL_LOG", false); FILEObj = FILESQL::createInstance(use_sql_log); free_swap = sysapi_swap_space(); dprintf( D_FULLDEBUG, "*** Reserved Swap = %d\n", reserved_swap ); dprintf( D_FULLDEBUG, "*** Free Swap = %d\n", free_swap ); if( reserved_swap && free_swap < reserved_swap ) { dprintf( D_ALWAYS, "Not enough reserved swap space\n" ); if(FILEObj) { delete FILEObj; } exit( JOB_NO_MEM ); } dprintf(D_ALWAYS, "uid=%d, euid=%d, gid=%d, egid=%d\n", getuid(), geteuid(), getgid(), getegid()); dprintf(D_FULLDEBUG, "argc = %d\n", argc); for(i = 0; i < argc; i++) { dprintf(D_FULLDEBUG, "argv[%d] = %s\n", i, argv[i]); } if( argc < 6 ) { usage(); } if (param_boolean("SHADOW_DEBUG_WAIT", false, false)) { int debug_wait = 1; dprintf(D_ALWAYS, "SHADOW_DEBUG_WAIT is TRUE, waiting for debugger to attach to pid %d.\n", (int)::getpid()); while (debug_wait) { sleep(1); } } CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,SPOOL_CUR_VERSION_SHADOW_SUPPORTS); if( strcmp("-pipe",argv[1]) == 0 ) { bogus_capability = argv[2]; cluster = argv[3]; proc = argv[4]; // read the big comment in the function for why this is here. RemoveNewShadowDroppings(cluster, proc); pipe_setup( cluster, proc, bogus_capability ); } else { schedd = argv[1]; host = argv[2]; bogus_capability = argv[3]; cluster = argv[4]; proc = argv[5]; if ( argc > 6 ) { IpcFile = argv[6]; dprintf(D_FULLDEBUG,"Setting IpcFile to %s\n",IpcFile); } else { IpcFile = NULL; } // read the big comment in the function for why this is here. RemoveNewShadowDroppings(cluster, proc); regular_setup( host, cluster, proc ); } scheddName = getenv( EnvGetName( ENV_SCHEDD_NAME ) ); #if 0 /* Don't want to share log file lock between child and pnarent */ (void)close( LockFd ); LockFd = -1; #endif // initialize the user log initializeUserLog(); My_Filesystem_Domain = param( "FILESYSTEM_DOMAIN" ); dprintf( D_ALWAYS, "My_Filesystem_Domain = \"%s\"\n", My_Filesystem_Domain ); My_UID_Domain = param( "UID_DOMAIN" ); dprintf( D_ALWAYS, "My_UID_Domain = \"%s\"\n", My_UID_Domain ); UseAFS = param_boolean_crufty( "USE_AFS", false ) ? TRUE : FALSE; UseNFS = param_boolean_crufty( "USE_NFS", false ) ? TRUE : FALSE; // if job specifies a checkpoint server host, this overrides // the config file parameters tmp = NULL; if (JobAd->LookupString(ATTR_CKPT_SERVER, &tmp) == 1) { if (CkptServerHost) free(CkptServerHost); UseCkptServer = TRUE; CkptServerHost = strdup(tmp); StarterChoosesCkptServer = FALSE; free(tmp); } else { free(tmp); if (CkptServerHost) { free(CkptServerHost); } CkptServerHost = param( "CKPT_SERVER_HOST" ); UseCkptServer = FALSE; if( CkptServerHost && param_boolean_crufty( "USE_CKPT_SERVER", true ) ) { UseCkptServer = TRUE; } StarterChoosesCkptServer = param_boolean_crufty("STARTER_CHOOSES_CKPT_SERVER", true) ? TRUE : FALSE; } // Initialize location of our checkpoint file. If we stored it // on a checkpoint server then set LastCkptServer. Otherwise, // LastCkptServer should be NULL to indicate that we should // look on the local disk for our checkpoint file. LastCkptServer = NULL; if (JobAd->LookupString(ATTR_LAST_CKPT_SERVER, &LastCkptServer) == 0) { free(LastCkptServer); LastCkptServer = NULL; } // LIGO if (param_boolean("ALWAYS_USE_LOCAL_CKPT_SERVER", false)) { if (LastCkptServer) { char *remoteHost = NULL; JobAd->LookupString(ATTR_REMOTE_HOST, &remoteHost); char *machineName = strrchr(remoteHost, '@'); if (machineName == NULL) { machineName = remoteHost; } else { machineName++; } LastCkptServer = strdup(machineName); CkptServerHost = strdup(machineName); dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, forcing LastCkptServer to %s\n", LastCkptServer); } else { dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, but checkpoint is not on server, restoring file local file\n"); } } MaxDiscardedRunTime = param_integer( "MAX_DISCARDED_RUN_TIME", 3600 ); CompressPeriodicCkpt = param_boolean( "COMPRESS_PERIODIC_CKPT", false ); PeriodicSync = param_boolean( "PERIODIC_MEMORY_SYNC", false ); CompressVacateCkpt = param_boolean( "COMPRESS_VACATE_CKPT", false ); SlowCkptSpeed = param_integer( "SLOW_CKPT_SPEED", 0 ); // Install signal handlers such that all signals are blocked when inside // the handler. sigset_t fullset; sigfillset(&fullset); install_sig_handler_with_mask( SIGCHLD,&fullset, reaper ); // SIGUSR1 is sent by the schedd when a job is removed with // condor_rm. install_sig_handler_with_mask( SIGUSR1, &fullset, handle_sigusr1 ); // SIGQUIT is sent for a fast shutdow. install_sig_handler_with_mask( SIGQUIT, &fullset, handle_sigquit ); // SIGTERM is sent for a graceful shutdow. install_sig_handler_with_mask( SIGTERM, &fullset, handle_sigterm ); /* Here we block the async signals. We do this mainly because on HPUX, * XDR wigs out if it is interrupted by a signal. We do it on all * platforms because it seems like a safe idea. We unblock the signals * during before select(), which is where we spend most of our time. */ block_signal(SIGCHLD); block_signal(SIGUSR1); /* If the completed job had been committed to the job queue, but for some reason the shadow exited wierdly and the schedd is trying to run it again, then simply write the job termination events and send the email as if the job had just ended. */ if (terminate_is_pending() == TRUE) { /* This function will exit()! */ handle_terminate_pending(); } HandleSyscalls(); Wrapup(); /* HACK! WHOOO!!!!! Throw the old shadow away already! */ /* This will figure out whether or not the job should go on hold AFTER the job has exited for whatever reason, or if the job should be allowed to exit. It modifies ExitReason approriately for job holding, or, get this, just EXCEPTs if the jobs is supposed to go into idle state and not leave. :) */ /* Small change by Todd : only check the static policy if the job really exited of its own accord -- we don't want to even evaluate the static policy if the job exited because it was preempted, for instance */ if (check_static_policy && (ExitReason == JOB_EXITED || ExitReason == JOB_KILLED || ExitReason == JOB_COREDUMPED)) { static_policy(); } if( My_UID_Domain ) { free( My_UID_Domain ); } if( My_Filesystem_Domain ) { free( My_Filesystem_Domain ); } if(FILEObj) { delete FILEObj; } dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n", ExitReason ); exit( ExitReason ); }