void BaseShadow::reconnectFailed( const char* reason ) { // try one last time to release the claim, write a UserLog event // about it, and exit with a special status. dprintf( D_ALWAYS, "Reconnect FAILED: %s\n", reason ); logReconnectFailedEvent( reason ); // if the shadow was born disconnected, exit with // JOB_RECONNECT_FAILED so the schedd can make // an accurate restart report. otherwise just // exist with JOB_SHOULD_REQUEUE. if ( attemptingReconnectAtStartup ) { dprintf(D_ALWAYS,"Exiting with JOB_RECONNECT_FAILED\n"); // does not return DC_Exit( JOB_RECONNECT_FAILED ); } else { dprintf(D_ALWAYS,"Exiting with JOB_SHOULD_REQUEUE\n"); // does not return DC_Exit( JOB_SHOULD_REQUEUE ); } // Should never get here.... ASSERT(true); }
void main_shutdown_rescue( int exitVal, Dag::dag_status dagStatus ) { // Avoid possible infinite recursion if you hit a fatal error // while writing a rescue DAG. static bool inShutdownRescue = false; if ( inShutdownRescue ) { return; } inShutdownRescue = true; dagman.dag->_dagStatus = dagStatus; debug_printf( DEBUG_QUIET, "Aborting DAG...\n" ); // Avoid writing two different rescue DAGs if the "main" DAG and // the final node (if any) both fail. static bool wroteRescue = false; if( dagman.dag ) { // We write the rescue DAG *before* removing jobs because // otherwise if we crashed, failed, or were killed while // removing them, we would leave the DAG in an // unrecoverable state... if( exitVal != 0 ) { if ( dagman.maxRescueDagNum > 0 ) { dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, wroteRescue, false, dagman._writePartialRescueDag ); wroteRescue = true; } else { debug_printf( DEBUG_QUIET, "No rescue DAG written because " "DAGMAN_MAX_RESCUE_NUM is 0\n" ); } } debug_printf( DEBUG_DEBUG_1, "We have %d running jobs to remove\n", dagman.dag->NumJobsSubmitted() ); if( dagman.dag->NumJobsSubmitted() > 0 ) { debug_printf( DEBUG_NORMAL, "Removing submitted jobs...\n" ); dagman.dag->RemoveRunningJobs(dagman); } if ( dagman.dag->NumScriptsRunning() > 0 ) { debug_printf( DEBUG_NORMAL, "Removing running scripts...\n" ); dagman.dag->RemoveRunningScripts(); } dagman.dag->PrintDeferrals( DEBUG_NORMAL, true ); // Start the final node if we have one. if ( dagman.dag->StartFinalNode() ) { // We started a final node; return here so we wait for the // final node to finish, instead of exiting immediately. inShutdownRescue = false; return; } dagman.dag->DumpNodeStatus( false, true ); dagman.dag->GetJobstateLog().WriteDagmanFinished( exitVal ); } MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); inShutdownRescue = false; DC_Exit( exitVal ); }
// this can be called by other functions, or by DC when the schedd is // shutdown gracefully void main_shutdown_graceful() { print_status(); dagman.dag->DumpNodeStatus( true, false ); dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_RESTART ); dagman.CleanUp(); DC_Exit( EXIT_RESTART ); }
/**
 * Pipe handler for the request pipe.
 *
 * Pulls every line that request_buffer can currently deliver, parses each
 * one as a GAHP command and dispatches it.  A line that fails to parse or
 * to be handled is logged and skipped.  Once no more lines are available,
 * an error or EOF on the buffer makes the process exit with status 1.
 *
 * @return TRUE when the buffer is merely drained (no error / EOF).
 */
int
request_pipe_handler(Service*, int)
{
    for (std::string* request = request_buffer.GetNextLine();
         request != NULL;
         request = request_buffer.GetNextLine())
    {
        dprintf (D_FULLDEBUG, "got work request: %s\n", request->c_str());

        Gahp_Args args;

        // Parse first; only a successfully parsed command is dispatched.
        const bool handled =
            parse_gahp_command (request->c_str(), &args) &&
            handle_gahp_command (args.argv, args.argc);

        if (!handled) {
            dprintf (D_ALWAYS, "ERROR processing %s\n", request->c_str());
        }

        // Each line handed out by GetNextLine() is released here.
        delete request;
    }

    // GetNextLine() returned NULL: find out whether the pipe is gone.
    if (request_buffer.IsError() || request_buffer.IsEOF()) {
        dprintf (D_ALWAYS, "Request pipe closed. Exiting...\n");
        DC_Exit (1);
    }

    return TRUE;
}
/**
 * Fast quit: run cleanUp() and exit with status 0 through DC_Exit().
 *
 * @return TRUE (only reached if DC_Exit() were to return).
 */
int
VMGahp::quitFast()
{
    cleanUp();

    DC_Exit(0);

    return TRUE;
}
int VMGahp::waitForCommand(int /*pipe_end*/) { MyString *line = NULL; while((line = m_request_buffer.GetNextLine()) != NULL) { const char *command = line->Value(); Gahp_Args args; VMRequest *new_req = NULL; if( m_inClassAd ) { if( strcasecmp(command, VMGAHP_COMMAND_CLASSAD_END) == 0 ) { m_inClassAd = false; // Everything is Ok. Now we got vmClassAd returnOutputSuccess(); }else { if( !m_jobAd->Insert(command) ) { vmprintf(D_ALWAYS, "Failed to insert \"%s\" into classAd, " "ignoring this attribute\n", command); } } }else { if(parse_vmgahp_command(command, args) && verifyCommand(args.argv, args.argc)) { new_req = preExecuteCommand(command, &args); if( new_req != NULL ) { // Execute the new request executeCommand(new_req); if(new_req->m_has_result) { movePendingReqToResultList(new_req); if (m_async_mode) { if (!m_new_results_signaled) { write_to_daemoncore_pipe("R\n"); } // So that we only do it once m_new_results_signaled = true; } } } }else { returnOutputError(); } } delete line; line = NULL; } // check if GetNextLine() returned NULL because of an error or EOF if(m_request_buffer.IsError() || m_request_buffer.IsEOF()) { vmprintf(D_ALWAYS, "Request buffer closed, exiting\n"); cleanUp(); DC_Exit(0); } return true; }
/**
 * Handle a quit command: set m_need_output_for_quit, run cleanUp(), and
 * exit with status 0 through DC_Exit().
 */
void
VMGahp::executeQuit(void)
{
    // Set before cleanUp() runs, as in the original statement order.
    m_need_output_for_quit = true;

    cleanUp();

    DC_Exit(0);
}
void BaseShadow::holdJob( const char* reason, int hold_reason_code, int hold_reason_subcode ) { dprintf( D_ALWAYS, "Job %d.%d going into Hold state (code %d,%d): %s\n", getCluster(), getProc(), hold_reason_code, hold_reason_subcode,reason ); if( ! jobAd ) { dprintf( D_ALWAYS, "In HoldJob() w/ NULL JobAd!" ); DC_Exit( JOB_SHOULD_HOLD ); } // cleanup this shadow (kill starters, etc) cleanUp( jobWantsGracefulRemoval() ); // Put the reason in our job ad. jobAd->Assign( ATTR_HOLD_REASON, reason ); jobAd->Assign( ATTR_HOLD_REASON_CODE, hold_reason_code ); jobAd->Assign( ATTR_HOLD_REASON_SUBCODE, hold_reason_subcode ); // try to send email (if the user wants it) emailHoldEvent( reason ); // update the job queue for the attributes we care about if( !updateJobInQueue(U_HOLD) ) { // trouble! TODO: should we do anything else? dprintf( D_ALWAYS, "Failed to update job queue!\n" ); } }
//--------------------------------------------------------------------------- static void Usage() { debug_printf( DEBUG_SILENT, "\nUsage: condor_dagman -f -t -l .\n" "\t\t-Lockfile <NAME.dag.lock>\n" "\t\t-Dag <NAME.dag>\n" "\t\t-CsdVersion <version string>\n" "\t\t[-Debug <level>]\n" "\t\t[-MaxIdle <int N>]\n" "\t\t[-MaxJobs <int N>]\n" "\t\t[-MaxPre <int N>]\n" "\t\t[-MaxPost <int N>]\n" "\t\t[-DontAlwaysRunPost]\n" "\t\t[-WaitForDebug]\n" "\t\t[-NoEventChecks]\n" "\t\t[-AllowLogError]\n" "\t\t[-UseDagDir]\n" "\t\t[-AutoRescue <0|1>]\n" "\t\t[-DoRescueFrom <int N>]\n" "\t\t[-Priority <int N>]\n" "\t\t[-AllowVersionMismatch]\n" "\t\t[-DumpRescue]\n" "\t\t[-Verbose]\n" "\t\t[-Force]\n" "\t\t[-Notification <never|always|complete|error>]\n" "\t\t[-Dagman <dagman_executable>]\n" "\t\t[-Outfile_dir <directory>]\n" "\t\t[-Update_submit]\n" "\t\t[-Import_env]\n" "\twhere NAME is the name of your DAG.\n" "\tdefault -Debug is -Debug %d\n", DEBUG_NORMAL); DC_Exit( EXIT_ERROR ); }
void BaseShadow::shutDown( int reason ) { // exit now if there is no job ad if ( !getJobAd() ) { DC_Exit( reason ); } // if we are being called from the exception handler, return // now to prevent infinite loop in case we call EXCEPT below. if ( reason == JOB_EXCEPTION ) { return; } // Only if the job is trying to leave the queue should we // evaluate the user job policy... if( reason == JOB_EXITED || reason == JOB_COREDUMPED ) { if( !waitingToUpdateSchedd() ) { shadow_user_policy.checkAtExit(); // WARNING: 'this' may have been deleted by the time we get here!!! } } else { // if we aren't trying to evaluate the user's policy, we just // want to evict this job. evictJob( reason ); } }
void BaseShadow::removeJob( const char* reason ) { this->removeJobPre(reason); // does not return. DC_Exit( JOB_SHOULD_REMOVE ); }
int master_exit(int retval) { cleanup_memory(); #ifdef WIN32 if ( NT_ServiceFlag == TRUE ) { terminate(retval); } #endif #if defined(WANT_CONTRIB) && defined(WITH_MANAGEMENT) #if defined(HAVE_DLOPEN) || defined(WIN32) MasterPluginManager::Shutdown(); #endif #endif // If we're positive that we are going to shut down, // we should clean out the shared port directory if // we created it. std::string dirname; if ( SharedPortEndpoint::CreatedSharedPortDirectory() && SharedPortEndpoint::GetDaemonSocketDir(dirname) ) { TemporaryPrivSentry tps(PRIV_CONDOR); Directory d(dirname.c_str()); d.Remove_Entire_Directory(); if (-1 == rmdir(dirname.c_str())) { dprintf(D_ALWAYS, "ERROR: failed to remove shared port temporary directory: %s (errno=%d).\n", strerror(errno), errno); } } DC_Exit(retval, shutdown_program ); return 1; // just to satisfy vc++ }
// Graceful shutdown: log that shutdown started, destroy the state
// machine object, and exit with status 0.
void
main_shutdown_graceful( )
{
    dprintf( D_ALWAYS, "main_shutdown_graceful started\n" );

    delete stateMachine;

    DC_Exit( 0 );
}
void BaseShadow::holdJobAndExit( const char* reason, int hold_reason_code, int hold_reason_subcode ) { holdJob(reason,hold_reason_code,hold_reason_subcode); // finally, exit and tell the schedd what to do DC_Exit( JOB_SHOULD_HOLD ); }
void main_shutdown_graceful() { #ifndef WIN32 delete xinter; #endif DC_Exit(EXIT_SUCCESS); }
// Log the one-line usage string at D_ALWAYS and exit with status 1.
void
usage()
{
    dprintf( D_ALWAYS, "Usage: condor_ft-gahp\n" );

    DC_Exit( 1 );
}
void main_init(int argc , char * argv []) { char *testfile = NULL; ClassAd *inputAd = NULL; int i; dprintf(D_ALWAYS, "main_init() called\n"); for (i=1; i<argc; i++ ) { if (match_prefix(argv[i],"-withfile")) { i++; if (argc <= i) { fprintf(stderr, "ERROR: Argument -withfile requires a parameter\n "); exit(1); } testfile = argv[i]; } } // end of parsing command line options if ( testfile ) { FILE* fp = safe_fopen_wrapper(testfile,"r"); if (!fp) { fprintf(stderr,"ERROR: Unable to open test file %s\n", testfile); DC_Exit(1); } int EndFlag=0, ErrorFlag=0, EmptyFlag=0; if( !( inputAd=new ClassAd(fp,"***", EndFlag, ErrorFlag, EmptyFlag) ) ){ fprintf( stderr, "ERROR: Out of memory\n" ); DC_Exit( 1 ); } fclose(fp); if ( ErrorFlag || EmptyFlag ) { fprintf( stderr, "ERROR - file %s does not contain a parseable ClassAd\n", testfile); DC_Exit(1); } // since this option is for testing, process then exit ClassAd * resultAd = process_request(inputAd); dPrintAd(D_ALWAYS, *resultAd); DC_Exit( 0 ); } }
// Successful-completion exit path: print the status, dump the node
// status, record EXIT_OKAY in the jobstate log, remove the lock file
// with tolerant_unlink(), clean up, and exit with EXIT_OKAY.
void
ExitSuccess()
{
    print_status();

    dagman.dag->DumpNodeStatus( false, false );
    dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_OKAY );

    tolerant_unlink( lockFileName );

    dagman.CleanUp();

    DC_Exit( EXIT_OKAY );
}
void ExitSuccess() { dagman.dag->DumpNodeStatus( false, false ); dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_OKAY ); MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); DC_Exit( EXIT_OKAY ); }
/**
 * Log the command-line synopsis at D_ALWAYS and exit with status 1.
 *
 * @param name  Program path; only its basename (condor_basename()) is
 *              printed.
 */
void
usage( char *name )
{
    // Note: the -o placeholder previously read "owern@uid-domain".
    dprintf( D_ALWAYS,
        "Usage: %s [-f] [-b] [-t] [-p <port>] [-s <schedd addr>] [-o <owner@uid-domain>] [-C <job constraint>] [-S <scratch dir>] [-A <aux id>]\n",
        condor_basename( name ) );

    DC_Exit( 1 );
}
void main_shutdown_fast() { #ifndef WIN32 if (io_loop_pid != -1) kill(io_loop_pid, SIGKILL); #endif DC_Exit(0); }
void main_shutdown_graceful() { #ifndef WIN32 if (io_loop_pid != -1) kill(io_loop_pid, SIGTERM); #endif DC_Exit(0); }
/**
 * Print the usage text on stderr and exit with status 1.
 *
 * @param MyName  Program name substituted into the first line.
 */
void
usage( char* MyName)
{
    fprintf( stderr, "Usage: %s [option]\n", MyName );

    // The remaining lines have no format arguments, so they are written
    // verbatim.
    fputs( " where [option] is one of:\n", stderr );
    fputs( " [-skip-benchmarks]\t(now a no-op)\n", stderr );

    DC_Exit( 1 );
}
void BaseShadow::holdJobAndExit( const char* reason, int hold_reason_code, int hold_reason_subcode ) { m_force_fast_starter_shutdown = true; holdJob(reason,hold_reason_code,hold_reason_subcode); // finally, exit and tell the schedd what to do DC_Exit( JOB_SHOULD_HOLD ); }
void BaseShadow::reconnectFailed( const char* reason ) { // try one last time to release the claim, write a UserLog event // about it, and exit with a special status. dprintf( D_ALWAYS, "Reconnect FAILED: %s\n", reason ); logReconnectFailedEvent( reason ); // does not return DC_Exit( JOB_SHOULD_REQUEUE ); }
// Log the option summary at D_ALWAYS and exit with status 0.
void
usage(void)
{
    dprintf(D_ALWAYS,
        "Usage info:\n"
        "--schedd <sinful>: Address of the schedd the transferd will contact\n"
        "--stdin: Accept a transfer request on stdin\n"
        "--id <ascii>: Used by the schedd to pair transferds to requests\n"
        "--shadow <upload|download>:\n"
        " Used with --stdin, transferd connects to shadow.\n"
        " This is demo mode with the starter.\n");

    DC_Exit(0);
}
void BaseShadow::evictJob( int reason ) { MyString from_where; MyString machine; if( getMachineName(machine) ) { from_where.formatstr(" from %s",machine.Value()); } dprintf( D_ALWAYS, "Job %d.%d is being evicted%s\n", getCluster(), getProc(), from_where.Value() ); if( ! jobAd ) { dprintf( D_ALWAYS, "In evictJob() w/ NULL JobAd!" ); DC_Exit( reason ); } // cleanup this shadow (kill starters, etc) cleanUp( jobWantsGracefulRemoval() ); // write stuff to user log: logEvictEvent( reason ); // record the time we were vacated into the job ad char buf[64]; sprintf( buf, "%s = %d", ATTR_LAST_VACATE_TIME, (int)time(0) ); jobAd->Insert( buf ); // update the job ad in the queue with some important final // attributes so we know what happened to the job when using // condor_history... if( !updateJobInQueue(U_EVICT) ) { // trouble! TODO: should we do anything else? dprintf( D_ALWAYS, "Failed to update job queue!\n" ); } // does not return. DC_Exit( reason ); }
// Dump the received command line (my_argc / my_argv) followed by the
// usage text at D_ALWAYS, then exit with status 1.
static void PREFAST_NORETURN
usage()
{
    // Show exactly what we were invoked with.
    dprintf(D_ALWAYS, "argc = %d\n", my_argc);
    int idx = 0;
    while( idx < my_argc ) {
        dprintf( D_ALWAYS, "argv[%d] = %s\n", idx, my_argv[idx] );
        ++idx;
    }

    dprintf(D_ALWAYS, "usage: condor_starter initiating_host\n");
    dprintf(D_ALWAYS, " or: condor_starter -job-keyword keyword\n");
    dprintf(D_ALWAYS, " -job-input-ad path\n");
    dprintf(D_ALWAYS, " -job-cluster number\n");
    dprintf(D_ALWAYS, " -job-proc number\n");
    dprintf(D_ALWAYS, " -job-subproc number\n");

    DC_Exit(1);
}
void main_pre_dc_init( int argc, char* argv[] ) { // handle -o, so that we can switch euid to the user before // daemoncore does most of its initialization work. int i = 1; while ( i < argc ) { if ( !strcmp( argv[i], "-o" ) ) { // Say what user we're running jobs on behave of. // If the schedd starts us as root, we need to switch to // this uid for most of our life. if ( argc <= i + 1 ) { usage( argv[0] ); } myUserName = strdup( argv[i + 1] ); break; } i++; } if ( myUserName ) { char *owner = strdup( myUserName ); char *domain = strchr( owner, '@' ); if ( domain ) { *domain = '\0'; domain = domain + 1; } if ( !init_user_ids(owner, domain)) { dprintf(D_ALWAYS, "init_user_ids() failed!\n"); // uids.C will EXCEPT when we set_user_priv() now // so there's not much we can do at this point } set_user_priv(); // We can't call daemonCore->Register_Priv_State() here because // there's no daemonCore object yet. We'll call it in main_init(). free( myUserName ); myUserName = owner; } else if ( is_root() ) { dprintf( D_ALWAYS, "Don't know what user to run as!\n" ); DC_Exit( 1 ); } else { myUserName = my_username(); } }
/**
 * Record a "mock" termination of the job in the job ad and the job queue.
 *
 * Logs the supplied exit details, stores them in the job ad
 * (ATTR_ON_EXIT_SIGNAL when exited_by_signal is true, ATTR_ON_EXIT_CODE
 * otherwise), and pushes the update with U_TERMINATE.  If there is no
 * job ad, exits with JOB_SHOULD_HOLD.
 *
 * @param exit_reason       Reason text; an empty string is replaced by
 *                          "Exited normally".
 * @param exited_by_signal  Whether the job ended due to a signal.
 * @param exit_code         Exit code (stored when not exited by signal).
 * @param exit_signal       Signal number (stored when exited by signal).
 * @param core_dumped       Stored in ATTR_JOB_CORE_DUMPED.
 */
void
BaseShadow::mockTerminateJob( MyString exit_reason,
        bool exited_by_signal, int exit_code, int exit_signal,
        bool core_dumped )
{
    if (exit_reason == "") {
        exit_reason = "Exited normally";
    }

    dprintf( D_ALWAYS, "Mock terminating job %d.%d: "
            "exited_by_signal=%s, exit_code=%d OR exit_signal=%d, "
            "core_dumped=%s, exit_reason=\"%s\"\n",
             getCluster(),
             getProc(),
             exited_by_signal ? "TRUE" : "FALSE",
             exit_code,
             exit_signal,
             core_dumped ? "TRUE" : "FALSE",
             exit_reason.Value());

    if( ! jobAd ) {
        // Terminate the message with a newline so it does not run into
        // the next log entry.
        dprintf(D_ALWAYS, "BaseShadow::mockTerminateJob(): NULL JobAd! "
            "Holding Job!\n");
        DC_Exit( JOB_SHOULD_HOLD );
    }

    // Insert the various exit attributes into our job ad.
    jobAd->Assign( ATTR_JOB_CORE_DUMPED, core_dumped );
    jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, exited_by_signal );

    if (exited_by_signal) {
        jobAd->Assign( ATTR_ON_EXIT_SIGNAL, exit_signal );
    } else {
        jobAd->Assign( ATTR_ON_EXIT_CODE, exit_code );
    }

    jobAd->Assign( ATTR_EXIT_REASON, exit_reason );

    // update the job queue for the attributes we care about
    if( !updateJobInQueue(U_TERMINATE) ) {
        // trouble!  TODO: should we do anything else?
        dprintf( D_ALWAYS, "Failed to update job queue!\n" );
    }
}