/**
 * Write the m_vmgahp_info ClassAd to the log, framed by a banner line
 * and a trailing blank line.
 *
 * @param debug_level  dprintf flags used for every line that is emitted.
 */
void
VMUniverseMgr::printVMGahpInfo( int debug_level )
{
	// Banner first, then the ad, then an empty line as a separator.
	dprintf( debug_level, "........VMGAHP info........\n" );
	dPrintAd( debug_level, m_vmgahp_info );
	dprintf( debug_level, "\n" );
}
void JobInfoCommunicator::checkForStarterDebugging( void ) { if( ! job_ad ) { EXCEPT( "checkForStarterDebugging() called with no job ad!" ); } // For debugging, see if there's a special attribute in the // job ad that sends us into an infinite loop, waiting for // someone to attach with a debugger volatile int starter_should_wait = 0; int tmp = 0; // Can't pass volatile int into LookupInteger job_ad->LookupInteger( ATTR_STARTER_WAIT_FOR_DEBUG, tmp ); starter_should_wait = tmp; if( starter_should_wait ) { dprintf( D_ALWAYS, "Job requested starter should wait for " "debugger with %s=%d, going into infinite loop\n", ATTR_STARTER_WAIT_FOR_DEBUG, starter_should_wait ); while( 1 ) { if ( !starter_should_wait ) { break; } } } // Also, if the starter has D_JOB turned on, we want to dump // out the job ad to the log file... if( IsDebugLevel( D_JOB ) ) { dprintf( D_JOB, "*** Job ClassAd ***\n" ); dPrintAd( D_JOB, *job_ad ); dprintf( D_JOB, "--- End of ClassAd ---\n" ); } }
/* Continue reading from stdin the rest of the protocol for this encapsulation method */ int TransferD::accept_transfer_request_encapsulation_old_classads(FILE *fin) { int i; int eof, error, empty; const char *classad_delimitor = "---\n"; ClassAd *ad; TransferRequest *treq = NULL; MyString cap; /* read the transfer request header packet upon construction */ ad = new ClassAd(fin, classad_delimitor, eof, error, empty); if (empty == TRUE) { EXCEPT("Protocol faliure, can't read initial Info Packet"); } // initialize the header information of the TransferRequest object. treq = new TransferRequest(ad); if (treq == NULL) { EXCEPT("Out of memory!"); } treq->dprintf(D_ALWAYS); /* read the information packet which describes the rest of the protocol */ if (treq->get_num_transfers() <= 0) { EXCEPT("Protocol error!"); } // read all the work ads associated with this TransferRequest for (i = 0; i < treq->get_num_transfers(); i++) { ad = new ClassAd(fin, classad_delimitor, eof, error, empty); if (empty == TRUE) { EXCEPT("Expected %d transfer job ads, got %d instead.", treq->get_num_transfers(), i); } dPrintAd(D_ALWAYS, *ad); treq->append_task(ad); } // Since stdin may only provide one transfer request currently, make up // a capability and shove it into the work hash cap = gen_capability(); // record that I've accepted it. m_treqs.insert(cap, treq); // mark it down that we are no longer need an inactivity timer m_inactivity_timer = 0; return TRUE; }
/**
 * Log a short description of this shadow: its path, whether it is a
 * DaemonCore shadow, and its ClassAd when one is available.
 *
 * @param debug_level  dprintf flags; every line except the first and the
 *                     ad dump is additionally printed with D_NOHEADER.
 */
void
Shadow::printInfo( int debug_level )
{
	const int headerless = debug_level | D_NOHEADER;

	dprintf( debug_level, "Info for \"%s\":\n", s_path );

	const char *dc_text = s_is_dc ? "True" : "False";
	dprintf( headerless, "IsDaemonCore: %s\n", dc_text );

	if( s_ad ) {
		dPrintAd( debug_level, *s_ad );
	} else {
		dprintf( headerless, "No ClassAd available!\n" );
	}

	dprintf( headerless, "*** End of shadow info ***\n" );
}
void main_init(int argc , char * argv []) { char *testfile = NULL; ClassAd *inputAd = NULL; int i; dprintf(D_ALWAYS, "main_init() called\n"); for (i=1; i<argc; i++ ) { if (match_prefix(argv[i],"-withfile")) { i++; if (argc <= i) { fprintf(stderr, "ERROR: Argument -withfile requires a parameter\n "); exit(1); } testfile = argv[i]; } } // end of parsing command line options if ( testfile ) { FILE* fp = safe_fopen_wrapper(testfile,"r"); if (!fp) { fprintf(stderr,"ERROR: Unable to open test file %s\n", testfile); DC_Exit(1); } int EndFlag=0, ErrorFlag=0, EmptyFlag=0; if( !( inputAd=new ClassAd(fp,"***", EndFlag, ErrorFlag, EmptyFlag) ) ){ fprintf( stderr, "ERROR: Out of memory\n" ); DC_Exit( 1 ); } fclose(fp); if ( ErrorFlag || EmptyFlag ) { fprintf( stderr, "ERROR - file %s does not contain a parseable ClassAd\n", testfile); DC_Exit(1); } // since this option is for testing, process then exit ClassAd * resultAd = process_request(inputAd); dPrintAd(D_ALWAYS, *resultAd); DC_Exit( 0 ); } }
void dumpClassad( const char* header, ClassAd* ad, int debug_flag ) { if( ! header ) { dprintf( D_ALWAYS, "ERROR: called dumpClassad() w/ NULL header\n" ); return; } if( ! ad ) { dprintf( D_ALWAYS, "ERROR: called dumpClassad(\"%s\") w/ NULL ad\n", header ); return; } if( IsDebugCatAndVerbosity(debug_flag) ) { dprintf( debug_flag, "*** ClassAd Dump: %s ***\n", header ); dPrintAd( debug_flag, *ad ); dprintf( debug_flag, "--- End of ClassAd ---\n" ); } }
/**
 * Socket handler for job ClassAd updates sent by the starter.
 *
 * Reads a "final update" flag followed by a ClassAd.  On success the
 * starter address is refreshed from the ad and the update is forwarded
 * to the claim (if any).  A read failure is treated as the final update.
 * On the final update the job update socket is cancelled and deleted.
 *
 * @param stream  stream to read the update from
 * @return always KEEP_STREAM
 */
int
Starter::receiveJobClassAdUpdate( Stream *stream )
{
	ClassAd update_ad;
	int final_update = 0;

		// It is expected that we will get here when the stream is closed.
		// Unfortunately, log noise will be generated when we try to read
		// from it.

	stream->decode();
	stream->timeout(10);

		// Read flag, ad and end-of-message in that order; stop at the
		// first failure.
	const bool got_update =
		stream->get( final_update ) &&
		getClassAd( stream, update_ad ) &&
		stream->end_of_message();

	if( got_update ) {
		dprintf(D_FULLDEBUG, "Received job ClassAd update from starter.\n");
		dPrintAd( D_JOB, update_ad );

			// In addition to new info about the job, the starter also
			// inserts contact info for itself (important for CCB and
			// shadow-starter reconnect, because startd needs to relay
			// starter's full contact info to the shadow when queried).
			// It's a bit of a hack to do it through this channel, but
			// better than nothing.
		update_ad.LookupString(ATTR_STARTER_IP_ADDR,m_starter_addr);

		if( s_claim ) {
			s_claim->receiveJobClassAdUpdate( update_ad );
		}
	} else {
			// Could not read a complete update: treat it as the last one.
		final_update = 1;
	}

	if( final_update ) {
		dprintf(D_FULLDEBUG, "Closing job ClassAd update socket from starter.\n");
		daemonCore->Cancel_Socket(s_job_update_sock);
		delete s_job_update_sock;
		s_job_update_sock = NULL;
	}

	return KEEP_STREAM;
}
void doContactSchedd() { int rc; Qmgr_connection *schedd; BaseJob *curr_job; ClassAd *next_ad; char expr_buf[12000]; bool schedd_updates_complete = false; bool schedd_deletes_complete = false; bool add_remove_jobs_complete = false; bool update_jobs_complete = false; bool commit_transaction = true; int failure_line_num = 0; bool send_reschedule = false; std::string error_str = ""; StringList dirty_job_ids; char *job_id_str; PROC_ID job_id; CondorError errstack; dprintf(D_FULLDEBUG,"in doContactSchedd()\n"); initJobExprs(); contactScheddTid = TIMER_UNSET; // vacateJobs ///////////////////////////////////////////////////// if ( pendingScheddVacates.getNumElements() != 0 ) { std::string buff; StringList job_ids; VacateRequest curr_request; int result; ClassAd* rval; pendingScheddVacates.startIterations(); while ( pendingScheddVacates.iterate( curr_request ) != 0 ) { formatstr( buff, "%d.%d", curr_request.job->procID.cluster, curr_request.job->procID.proc ); job_ids.append( buff.c_str() ); } char *tmp = job_ids.print_to_string(); if ( tmp ) { dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp ); free(tmp); tmp = NULL; } rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack ); if ( rval == NULL ) { formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!", errstack.getFullText().c_str() ); goto contact_schedd_failure; } else { pendingScheddVacates.startIterations(); while ( pendingScheddVacates.iterate( curr_request ) != 0 ) { formatstr( buff, "job_%d_%d", curr_request.job->procID.cluster, curr_request.job->procID.proc ); if ( !rval->LookupInteger( buff.c_str(), result ) ) { dprintf( D_FULLDEBUG, "vacateJobs returned malformed ad\n" ); EXCEPT( "vacateJobs returned malformed ad" ); } else { dprintf( D_FULLDEBUG, " %d.%d vacate result: %d\n", curr_request.job->procID.cluster, curr_request.job->procID.proc,result); pendingScheddVacates.remove( curr_request.job->procID ); curr_request.result = (action_result_t)result; 
curr_request.job->SetEvaluateState(); completedScheddVacates.insert( curr_request.job->procID, curr_request ); } } delete rval; } } schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName, CondorVersion() ); if ( !schedd ) { error_str = "Failed to connect to schedd!"; goto contact_schedd_failure; } // CheckLeases ///////////////////////////////////////////////////// if ( checkLeasesSignaled ) { dprintf( D_FULLDEBUG, "querying for renewed leases\n" ); // Grab the lease attributes of all the jobs in our global hashtable. BaseJob::JobsByProcId.startIterations(); while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) { int new_expiration; rc = GetAttributeInt( curr_job->procID.cluster, curr_job->procID.proc, ATTR_TIMER_REMOVE_CHECK, &new_expiration ); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // This job doesn't have doesn't have a lease from // the submitter. Skip it. continue; } } curr_job->UpdateJobLeaseReceived( new_expiration ); } checkLeasesSignaled = false; } // end of handling check leases // AddJobs ///////////////////////////////////////////////////// if ( addJobsSignaled || firstScheddContact ) { int num_ads = 0; dprintf( D_FULLDEBUG, "querying for new jobs\n" ); // Make sure we grab all Globus Universe jobs (except held ones // that we previously indicated we were done with) // when we first start up in case we're recovering from a // shutdown/meltdown. // Otherwise, grab all jobs that are unheld and aren't marked as // currently being managed and aren't marked as not matched. // If JobManaged is undefined, equate it with false. // If Matched is undefined, equate it with true. // NOTE: Schedds from Condor 6.6 and earlier don't include // "(Universe==9)" in the constraint they give to the gridmanager, // so this gridmanager will pull down non-globus-universe ads, // although it won't use them. This is inefficient but not // incorrect behavior. 
if ( firstScheddContact ) { // Grab all jobs for us to manage. This expression is a // derivative of the expression below for new jobs. We add // "|| Managed =?= TRUE" to also get jobs our previous // incarnation was in the middle of managing when it died // (if it died unexpectedly). With the new term, the // "&& Managed =!= TRUE" from the new jobs expression becomes // superfluous (by boolean logic), so we drop it. sprintf( expr_buf, "%s && %s && ((%s && %s) || %s)", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_matched_or_undef.c_str(), expr_not_held.c_str(), expr_managed.c_str() ); } else { // Grab new jobs for us to manage sprintf( expr_buf, "%s && %s && %s && %s && %s", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_matched_or_undef.c_str(), expr_not_held.c_str(), expr_not_managed.c_str() ); } dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { PROC_ID procID; BaseJob *old_job; int job_is_matched = 1; // default to true if not in ClassAd next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, procID.proc ); bool job_is_managed = jobExternallyManaged(next_ad); next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched); if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) { JobType *job_type = NULL; BaseJob *new_job = NULL; // job had better be either managed or matched! (or both) ASSERT( job_is_managed || job_is_matched ); if ( MustExpandJobAd( next_ad ) ) { // Get the expanded ClassAd from the schedd, which // has the GridResource filled in with info from // the matched ad. 
delete next_ad; next_ad = NULL; next_ad = GetJobAd(procID.cluster,procID.proc); if ( next_ad == NULL && errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } if ( next_ad == NULL ) { // We may get here if it was not possible to expand // one of the $$() expressions. We don't want to // roll back the transaction and blow away the // hold that the schedd just put on the job, so // simply skip over this ad. dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d. errno=%d\n",procID.cluster,procID.proc,errno); goto contact_schedd_next_add_job; } } // Search our job types for one that'll handle this job jobTypes.Rewind(); while ( jobTypes.Next( job_type ) ) { if ( job_type->AdMatchFunc( next_ad ) ) { // Found one! dprintf( D_FULLDEBUG, "Using job type %s for job %d.%d\n", job_type->Name, procID.cluster, procID.proc ); break; } } if ( job_type != NULL ) { new_job = job_type->CreateFunc( next_ad ); } else { dprintf( D_ALWAYS, "No handlers for job %d.%d\n", procID.cluster, procID.proc ); new_job = new BaseJob( next_ad ); } ASSERT(new_job); new_job->SetEvaluateState(); dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n", new_job->procID.cluster,new_job->procID.proc); num_ads++; if ( !job_is_managed ) { rc = tSetAttributeString( new_job->procID.cluster, new_job->procID.proc, ATTR_JOB_MANAGED, MANAGED_EXTERNAL); if ( rc < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } } else { // We already know about this job, skip // But also set Managed=true on the schedd so that it won't // keep signalling us about it delete next_ad; rc = tSetAttributeString( procID.cluster, procID.proc, ATTR_JOB_MANAGED, MANAGED_EXTERNAL ); if ( rc < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } contact_schedd_next_add_job: next_ad = GetNextJobByConstraint( expr_buf, 0 ); } // end of while next_ad if ( errno == ETIMEDOUT ) { 
failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads); } // end of handling add jobs // RemoveJobs ///////////////////////////////////////////////////// // We always want to perform this check. Otherwise, we may overwrite a // REMOVED/HELD/COMPLETED status with something else below. { int num_ads = 0; dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" ); // Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we // haven't previously indicated that we're done with (by setting // JobManaged to "Schedd". sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))", ScheddJobConstraint, expr_not_completely_done.c_str(), ATTR_JOB_STATUS, REMOVED, ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD, ATTR_JOB_MANAGED, MANAGED_EXTERNAL ); dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { PROC_ID procID; BaseJob *next_job; int curr_status; next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, procID.proc ); next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status ); if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) { // Should probably skip jobs we already have marked as // held or removed next_job->JobAdUpdateFromSchedd( next_ad, true ); num_ads++; } else if ( curr_status == REMOVED ) { // If we don't know about the job, act like we got an // ADD_JOBS signal from the schedd the next time we // connect, so that we'll create a Job object for it // and decide how it needs to be handled. // TODO The AddJobs and RemoveJobs queries shoule be // combined into a single query. dprintf( D_ALWAYS, "Don't know about removed job %d.%d. " "Will treat it as a new job to manage\n", procID.cluster, procID.proc ); addJobsSignaled = true; } else { dprintf( D_ALWAYS, "Don't know about held/completed job %d.%d. 
" "Ignoring it\n", procID.cluster, procID.proc ); } delete next_ad; next_ad = GetNextJobByConstraint( expr_buf, 0 ); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads); } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } add_remove_jobs_complete = true; // Retrieve dirty attributes ///////////////////////////////////////////////////// if ( updateJobsSignaled ) { dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" ); sprintf( expr_buf, "%s && %s && %s && %s", expr_schedd_job_constraint.c_str(), expr_not_completely_done.c_str(), expr_not_held.c_str(), expr_managed.c_str() ); dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf); next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 ); while ( next_ad != NULL ) { ClassAd updates; char str[PROC_ID_STR_BUFLEN]; next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster ); next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc ); if ( GetDirtyAttributes( job_id.cluster, job_id.proc, &updates ) < 0 ) { dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc ); failure_line_num = __LINE__; delete next_ad; goto contact_schedd_disconnect; } else { dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc); dPrintAd(D_JOB, updates); } if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) { curr_job->JobAdUpdateFromSchedd( &updates, false ); ProcIdToStr( job_id, str ); dirty_job_ids.append( str ); } else { dprintf( D_ALWAYS, "Don't know about updated job %d.%d. 
" "Ignoring it\n", job_id.cluster, job_id.proc ); } delete next_ad; next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 ); } } update_jobs_complete = true; // if ( BeginTransaction() < 0 ) { errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } // requestJobStatus ///////////////////////////////////////////////////// if ( pendingJobStatus.getNumElements() != 0 ) { JobStatusRequest curr_request; pendingJobStatus.startIterations(); while ( pendingJobStatus.iterate( curr_request ) != 0 ) { int status; rc = GetAttributeInt( curr_request.job_id.cluster, curr_request.job_id.proc, ATTR_JOB_STATUS, &status ); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // The job is not in the schedd's job queue. This // probably means that the user did a condor_rm -f, // so return a job status of REMOVED. status = REMOVED; } } // return status dprintf( D_FULLDEBUG, "%d.%d job status: %d\n", curr_request.job_id.cluster, curr_request.job_id.proc, status ); pendingJobStatus.remove( curr_request.job_id ); curr_request.job_status = status; daemonCore->Reset_Timer( curr_request.tid, 0 ); completedJobStatus.insert( curr_request.job_id, curr_request ); } } // Update existing jobs ///////////////////////////////////////////////////// ScheddUpdateRequest *curr_request; pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n", curr_job->procID.cluster, curr_job->procID.proc); const char *attr_name; const char *attr_value; ExprTree *expr; bool fake_job_in_queue = false; curr_job->jobAd->ResetExpr(); while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true && fake_job_in_queue == false ) { attr_value = ExprTreeToString( expr ); dprintf(D_FULLDEBUG," %s = 
%s\n",attr_name,attr_value); rc = SetAttribute( curr_job->procID.cluster, curr_job->procID.proc, attr_name, attr_value); if ( rc < 0 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } else { // The job is not in the schedd's job queue. This // probably means that the user did a condor_rm -f, // so pretend that all updates for the job succeed. // Otherwise, we'll never make forward progress on // the job. // TODO We should also fake a job status of REMOVED // to the job, so it can do what cleanup it can. fake_job_in_queue = true; break; } } } } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } schedd_updates_complete = true; // Delete existing jobs ///////////////////////////////////////////////////// errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; if ( curr_job->deleteFromSchedd ) { dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n", curr_job->procID.cluster, curr_job->procID.proc); rc = DestroyProc(curr_job->procID.cluster, curr_job->procID.proc); // NOENT means the job doesn't exist. Good enough for us. 
if ( rc < 0 && rc != DESTROYPROC_ENOENT) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } } } if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; commit_transaction = false; goto contact_schedd_disconnect; } schedd_deletes_complete = true; contact_schedd_disconnect: DisconnectQ( schedd, commit_transaction ); if ( add_remove_jobs_complete == true ) { firstScheddContact = false; addJobsSignaled = false; } else { formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num ); goto contact_schedd_failure; } if ( update_jobs_complete == true ) { updateJobsSignaled = false; } else { formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num ); goto contact_schedd_failure; } if ( schedd_updates_complete == false ) { formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num ); goto contact_schedd_failure; } // Clear dirty bits for all jobs updated if ( !dirty_job_ids.isEmpty() ) { ClassAd *rval; dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n", dirty_job_ids.number() ); dirty_job_ids.rewind(); rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack ); if ( rval == NULL ) { dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes. CondorError: %s\n", errstack.getFullText().c_str() ); } delete rval; } // Wake up jobs that had schedd updates pending and delete job // objects that wanted to be deleted pendingScheddUpdates.startIterations(); while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) { curr_job = curr_request->m_job; curr_job->jobAd->ClearAllDirtyFlags(); if ( curr_job->deleteFromGridmanager ) { // If the Job object wants to delete the job from the // schedd but we failed to do so, don't delete the job // object yet; wait until we successfully delete the job // from the schedd. 
if ( curr_job->deleteFromSchedd == true && schedd_deletes_complete == false ) { continue; } // If wantRematch is set, send a reschedule now if ( curr_job->wantRematch ) { send_reschedule = true; } pendingScheddUpdates.remove( curr_job->procID ); pendingScheddVacates.remove( curr_job->procID ); pendingJobStatus.remove( curr_job->procID ); completedJobStatus.remove( curr_job->procID ); completedScheddVacates.remove( curr_job->procID ); delete curr_job; } else { pendingScheddUpdates.remove( curr_job->procID ); if ( curr_request->m_notify ) { curr_job->SetEvaluateState(); } } delete curr_request; } // Poke objects that wanted to be notified when a schedd update completed // successfully (possibly minus deletes) int timer_id; scheddUpdateNotifications.Rewind(); while ( scheddUpdateNotifications.Next( timer_id ) ) { daemonCore->Reset_Timer( timer_id, 0 ); } scheddUpdateNotifications.Clear(); if ( send_reschedule == true ) { ScheddObj->reschedule(); } // Check if we have any jobs left to manage. If not, exit. if ( BaseJob::JobsByProcId.getNumElements() == 0 ) { dprintf( D_ALWAYS, "No jobs left, shutting down\n" ); daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM ); } lastContactSchedd = time(NULL); if ( schedd_deletes_complete == false ) { error_str = "Problem using DestroyProc to delete jobs!"; goto contact_schedd_failure; } scheddFailureCount = 0; // For each job that had dirty attributes, re-evaluate the policy dirty_job_ids.rewind(); while ( (job_id_str = dirty_job_ids.next()) != NULL ) { StrToProcIdFixMe(job_id_str, job_id); if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) { curr_job->EvalPeriodicJobExpr(); } } dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n"); return; contact_schedd_failure: scheddFailureCount++; if ( error_str == "" ) { error_str = "Failure in doContactSchedd"; } if ( scheddFailureCount >= maxScheddFailures ) { dprintf( D_ALWAYS, "%s\n", error_str.c_str() ); EXCEPT( "Too many failures connecting to schedd!" 
); } dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() ); lastContactSchedd = time(NULL); RequestContactSchedd(); return; }
bool DCStarter::peek(bool transfer_stdout, ssize_t &stdout_offset, bool transfer_stderr, ssize_t &stderr_offset, const std::vector<std::string> &filenames, std::vector<ssize_t> &offsets, size_t max_bytes, bool &retry_sensible, PeekGetFD &next, std::string &error_msg, unsigned timeout, const std::string &sec_session_id, DCTransferQueue *xfer_q) { compat_classad::ClassAd ad; ad.InsertAttr(ATTR_JOB_OUTPUT, transfer_stdout); ad.InsertAttr("OutOffset", stdout_offset); ad.InsertAttr(ATTR_JOB_ERROR, transfer_stderr); ad.InsertAttr("ErrOffset", stderr_offset); ad.InsertAttr(ATTR_VERSION, CondorVersion()); size_t total_files = 0; total_files += transfer_stdout ? 1 : 0; total_files += transfer_stderr ? 1 : 0; if (filenames.size()) { total_files += filenames.size(); std::vector<classad::ExprTree *> filelist; filelist.reserve(filenames.size()); std::vector<classad::ExprTree *> offsetlist; offsetlist.reserve(filenames.size()); std::vector<ssize_t>::const_iterator it2 = offsets.begin(); for (std::vector<std::string>::const_iterator it = filenames.begin(); it != filenames.end() && it2 != offsets.end(); it++, it2++) { classad::Value value; value.SetStringValue(*it); filelist.push_back(classad::Literal::MakeLiteral(value)); value.SetIntegerValue(*it2); offsetlist.push_back(classad::Literal::MakeLiteral(value)); } classad::ExprTree *list(classad::ExprList::MakeExprList(filelist)); ad.Insert("TransferFiles", list); list = classad::ExprList::MakeExprList(offsetlist); ad.Insert("TransferOffsets", list); } ad.InsertAttr(ATTR_MAX_TRANSFER_BYTES, static_cast<long long>(max_bytes)); ReliSock sock; if( !connectSock(&sock, timeout, NULL) ) { error_msg = "Failed to connect to starter"; return false; } if( !startCommand(STARTER_PEEK, &sock, timeout, NULL, NULL, false, sec_session_id.c_str()) ) { error_msg = "Failed to send START_PEEK to starter"; return false; } sock.encode(); if (!putClassAd(&sock, ad) || !sock.end_of_message()) { error_msg = "Failed to send request to starter"; return false; 
} compat_classad::ClassAd response; sock.decode(); if (!getClassAd(&sock, response) || !sock.end_of_message()) { error_msg = "Failed to read response for peeking at logs."; return false; } dPrintAd(D_FULLDEBUG, response); bool success = false; if (!response.EvaluateAttrBool(ATTR_RESULT, success) || !success) { response.EvaluateAttrBool(ATTR_RETRY, retry_sensible); error_msg = "Remote operation failed."; response.EvaluateAttrString(ATTR_ERROR_STRING, error_msg); return false; } classad::Value valueX; classad_shared_ptr<classad::ExprList> list; if (!response.EvaluateAttr("TransferFiles", valueX) || !valueX.IsSListValue(list)) { error_msg = "Unable to evaluate starter response"; return false; } classad_shared_ptr<classad::ExprList> offlist; if (!response.EvaluateAttr("TransferOffsets", valueX) || !valueX.IsSListValue(offlist)) { error_msg = "Unable to evaluate starter response (missing offsets)"; return false; } size_t remaining = max_bytes; size_t file_count = 0; classad::ExprList::const_iterator it2 = offlist->begin(); for (classad::ExprList::const_iterator it = list->begin(); it != list->end() && it2 != offlist->end(); it++, it2++) { classad::Value value; (*it2)->Evaluate(value); off_t off = -1; value.IsIntegerValue(off); (*it)->Evaluate(value); std::string filename; int64_t xfer_fd = -1; if (!value.IsStringValue(filename) && value.IsIntegerValue(xfer_fd)) { if (xfer_fd == 0) filename = "_condor_stdout"; if (xfer_fd == 1) filename = "_condor_stderr"; } int fd = next.getNextFD(filename); filesize_t size = -1; int retval; if ((retval = sock.get_file(&size, fd, false, false, remaining, xfer_q)) && (retval != GET_FILE_MAX_BYTES_EXCEEDED)) { error_msg = "Internal error when transferring file " + filename; } else if (size >= 0) { remaining -= max_bytes; file_count++; off += size; } else { error_msg = "Failed to transfer file " + filename; } if (xfer_fd == 0) { stdout_offset = off; //dprintf(D_FULLDEBUG, "New stdout offset: %ld\n", stdout_offset); } else if (xfer_fd == 1) 
{ stderr_offset = off; } else { std::vector<ssize_t>::iterator it4 = offsets.begin(); for (std::vector<std::string>::const_iterator it3 = filenames.begin(); it3 != filenames.end() && it4 != offsets.end(); it3++, it4++) { if (*it3 == filename) *it4 = off; } } } size_t remote_file_count; if (!sock.get(remote_file_count) || !sock.end_of_message()) { error_msg = "Unable to get remote file count."; return false; } if (file_count != remote_file_count) { std::stringstream ss; ss << "Recieved " << file_count << " files, but remote side thought it sent " << remote_file_count << " files"; error_msg = ss.str(); return false; } if ((total_files != file_count) && !error_msg.size()) { error_msg = "At least one file transfer failed."; return false; } return true; }
void XferSummary::time_out(time_t now, char *hostaddr) { ClassAd info; char line[128], *tmp; char *str = NULL; SetMyTypeName(info, "CkptServer"); SetTargetTypeName(info, "CkptFile"); sprintf(line, "%s = \"%s\"", ATTR_NAME, get_local_fqdn().Value()); info.Insert(line); sprintf(line, "%s = \"%s\"", ATTR_MACHINE, hostaddr ); info.Insert(line); sprintf(line, "%s = \"%s\"", ATTR_VERSION, CondorVersion() ); info.Insert(line); sprintf(line, "%s = \"%s\"", ATTR_PLATFORM, CondorPlatform() ); info.Insert(line); sprintf(line, "NumSends = %d", num_sends); info.Insert(line); sprintf(line, "BytesSent = %d", (int) bytes_sent); info.Insert(line); sprintf(line, "TimeSending = %d", time_sending); info.Insert(line); sprintf(line, "AvgSendBandwidth = %f", num_sends ? tot_send_bandwidth / num_sends : 0.0); info.Insert(line); sprintf(line, "NumRecvs = %d", num_recvs); info.Insert(line); sprintf(line, "BytesReceived = %d", (int) bytes_recv); info.Insert(line); sprintf(line, "TimeReceiving = %d", time_recving); info.Insert(line); sprintf(line, "AvgReceiveBandwidth = %f", num_recvs ? tot_recv_bandwidth / num_recvs : 0.0); info.Insert(line); /* ctime adds a newline at the end of the ascii conversion.... */ str = ctime(&start_time); sprintf(line, "CkptServerIntervalStart = \"%s\"", str ? str : "Unknown\n"); tmp = strchr( line, '\n' ); if (tmp != NULL) { /* delete the newline */ *tmp = '\"'; tmp++; *tmp = '\0'; } info.Insert(line); /* ctime adds a newline at the end of the ascii conversion.... */ str = ctime(&now); sprintf(line, "CkptServerIntervalEnd = \"%s\"", str ? str : "Unknown\n"); tmp = strchr( line, '\n' ); if (tmp != NULL) { /* delete the newline */ *tmp = '\"'; tmp++; *tmp = '\0'; } info.Insert(line); info.Assign("Disk", sysapi_disk_space(pwd.Value())); // Send to collector if ( Collectors ) { dprintf(D_NETWORK, "Sending CkptServer ClassAd:\n"); dPrintAd(D_NETWORK, info); Collectors->sendUpdates (UPDATE_CKPT_SRVR_AD, &info, NULL, true); } init(); }
/**
 * Read the job ClassAd from job_ad_file ("-" means stdin).
 *
 * Lines are inserted into a new ClassAd one at a time; lines starting
 * with '#' are skipped and a line consisting of "***" ends the ad.
 * EXCEPTs if the file cannot be opened, is empty, or contains a line
 * that cannot be inserted.
 *
 * If the ad sets ATTR_SHADOW_WAIT_FOR_DEBUG to a non-zero integer, the
 * function spins until the flag is cleared (e.g. from a debugger).
 *
 * @return the newly allocated ClassAd
 */
ClassAd*
readJobAd( void )
{
	ClassAd* ad = NULL;
	bool is_stdin = false;
	bool read_something = false;

	ASSERT( job_ad_file );

	if( job_ad_file[0] == '-' && job_ad_file[1] == '\0' ) {
		fp = stdin;
		is_stdin = true;
	} else {
			// Only open the file if it is not open already.
		if (fp == NULL) {
			fp = safe_fopen_wrapper_follow( job_ad_file, "r" );
			if( ! fp ) {
				EXCEPT( "Failed to open ClassAd file (%s): %s (errno %d)",
						job_ad_file, strerror(errno), errno );
			}
		}
	}

	dprintf( D_FULLDEBUG, "Reading job ClassAd from %s\n",
			 is_stdin ? "STDIN" : job_ad_file );

	ad = new ClassAd;
	MyString line;
	while( line.readLine(fp) ) {
		read_something = true;
		line.chomp();
		if( line[0] == '#' ) {
			dprintf( D_JOB, "IGNORING COMMENT: %s\n", line.Value() );
			continue;
		}
		if( line == "***" ) {
			dprintf( D_JOB, "Saw ClassAd delimitor, stopping\n" );
			break;
		}
		if( ! ad->Insert(line.Value()) ) {
			EXCEPT( "Failed to insert \"%s\" into ClassAd!", line.Value() );
		}
	}
	if( ! read_something ) {
		EXCEPT( "reading ClassAd from (%s): file is empty",
				is_stdin ? "STDIN" : job_ad_file );
	}
	if( IsDebugVerbose(D_JOB) ) {
		dPrintAd( D_JOB, *ad );
	}

		// For debugging, see if there's a special attribute in the
		// job ad that sends us into an infinite loop, waiting for
		// someone to attach with a debugger.
		// The flag must be volatile: otherwise the empty loop below has
		// no observable behavior, and the compiler may assume it
		// terminates or never re-read the variable.
	volatile int shadow_should_wait = 0;
	int tmp = 0; // Can't pass volatile int into LookupInteger
	ad->LookupInteger( ATTR_SHADOW_WAIT_FOR_DEBUG, tmp );
	shadow_should_wait = tmp;
	if( shadow_should_wait ) {
		dprintf( D_ALWAYS, "Job requested shadow should wait for "
				 "debugger with %s=%d, going into infinite loop\n",
				 ATTR_SHADOW_WAIT_FOR_DEBUG, shadow_should_wait );
		while( shadow_should_wait ) { }
	}

	return ad;
}
bool CollectorEngine::ValidateClassAd(int command,ClassAd *clientAd,Sock *sock) { if( !m_collector_requirements ) { // no need to do any of the following checks if the admin has // not configured any COLLECTOR_REQUIREMENTS return true; } char const *ipattr = NULL; switch( command ) { case MERGE_STARTD_AD: case UPDATE_STARTD_AD: case UPDATE_STARTD_AD_WITH_ACK: ipattr = ATTR_STARTD_IP_ADDR; break; case UPDATE_SCHEDD_AD: case UPDATE_SUBMITTOR_AD: ipattr = ATTR_SCHEDD_IP_ADDR; break; case UPDATE_MASTER_AD: ipattr = ATTR_MASTER_IP_ADDR; break; case UPDATE_NEGOTIATOR_AD: ipattr = ATTR_NEGOTIATOR_IP_ADDR; break; case UPDATE_COLLECTOR_AD: ipattr = ATTR_COLLECTOR_IP_ADDR; break; case UPDATE_LICENSE_AD: case UPDATE_CKPT_SRVR_AD: case UPDATE_STORAGE_AD: case UPDATE_HAD_AD: case UPDATE_AD_GENERIC: case UPDATE_GRID_AD: case UPDATE_ACCOUNTING_AD: default: break; } if(ipattr) { MyString my_address; MyString subsys_ipaddr; // Some ClassAds contain two copies of the IP address, // one named "MyAddress" and one named "<SUBSYS>IpAddr". // If the latter exists, then it _must_ match the former, // because people may be filtering in COLLECTOR_REQUIREMENTS // on MyAddress, and we don't want them to have to worry // about filtering on the older cruftier <SUBSYS>IpAddr. if( clientAd->LookupString( ipattr, subsys_ipaddr ) ) { clientAd->LookupString( ATTR_MY_ADDRESS, my_address ); if( my_address != subsys_ipaddr ) { dprintf(D_ALWAYS, "%s VIOLATION: ClassAd from %s advertises inconsistent" " IP addresses: %s=%s, %s=%s\n", COLLECTOR_REQUIREMENTS, (sock ? 
sock->get_sinful_peer() : "(NULL)"), ipattr, subsys_ipaddr.Value(), ATTR_MY_ADDRESS, my_address.Value()); return false; } } } // Now verify COLLECTOR_REQUIREMENTS bool collector_req_result = false; if( !EvalBool(COLLECTOR_REQUIREMENTS,m_collector_requirements,clientAd,collector_req_result) ) { dprintf(D_ALWAYS,"WARNING: %s did not evaluate to a boolean result.\n",COLLECTOR_REQUIREMENTS); collector_req_result = false; } if( !collector_req_result ) { static int details_shown=0; bool show_details = (details_shown<10) || IsFulldebug(D_FULLDEBUG); dprintf(D_ALWAYS,"%s VIOLATION: requirements do not match ad from %s.%s\n", COLLECTOR_REQUIREMENTS, sock ? sock->get_sinful_peer() : "(null)", show_details ? " Contents of the ClassAd:" : " (turn on D_FULLDEBUG to see details)"); if( show_details ) { details_shown += 1; dPrintAd(D_ALWAYS, *clientAd); } return false; } return true; }
ClassAd * process_request(const ClassAd *inputAd) { static unsigned int req_number = 0; // Number each new request. ClassAd *resultAd = new ClassAd(); ASSERT(resultAd); resultAd->Assign("REQUEST_NUMBER",++req_number); dprintf(D_ALWAYS,"----------------------------------------\nProcessing request %d\n",req_number); dprintf(D_FULLDEBUG,"Contents of request classad:\n"); if ( inputAd ) { dPrintAd(D_FULLDEBUG, *(ClassAd*)inputAd); } // Create two temp dirs, one to serve as the iwd for the command, another // to hold the stdout/err. char *iwd = create_temp_file(true); char *stdio_iwd = create_temp_file(true); if (!iwd || !stdio_iwd) { handle_process_request_error("failed to create temp dirs",req_number,resultAd); return resultAd; } // Do the work. do_process_request(inputAd, resultAd, req_number, iwd, stdio_iwd); // Blow away our temp dirs unless we are in debug mode bool debug_mode = param_boolean("SOAPSHELL_DEBUG_MODE",false); if ( !debug_mode ) { if ( iwd ) { Directory dir(iwd); dir.Remove_Full_Path(iwd); free(iwd); } if ( stdio_iwd ) { Directory dir(stdio_iwd); dir.Remove_Full_Path(stdio_iwd); free(stdio_iwd); } } else { if ( iwd ) { dprintf(D_ALWAYS,"SOAPSHELL_DEBUG_MODE=True so not removing iwd %s\n", iwd); free(iwd); } if ( stdio_iwd ) { dprintf(D_ALWAYS,"SOAPSHELL_DEBUG_MODE=True so not removing stdio_iwd %s\n", stdio_iwd); free(stdio_iwd); } } dprintf(D_FULLDEBUG,"Contents of result classad:\n"); dPrintAd(D_FULLDEBUG, *resultAd); dprintf(D_ALWAYS,"Finished processing request %d\n",req_number); return resultAd; }
// fetch all ads from the collector that satisfy the constraints QueryResult CondorQuery:: fetchAds (ClassAdList &adList, const char *poolName, CondorError* errstack) { Sock* sock; int more; QueryResult result; ClassAd queryAd(extraAttrs), *ad; if ( !poolName ) { return Q_NO_COLLECTOR_HOST; } // contact collector Daemon my_collector( DT_COLLECTOR, poolName, NULL ); if( !my_collector.locate() ) { // We were passed a bogus poolName, abort gracefully return Q_NO_COLLECTOR_HOST; } // make the query ad result = getQueryAd (queryAd); if (result != Q_OK) return result; if( IsDebugLevel( D_HOSTNAME ) ) { dprintf( D_HOSTNAME, "Querying collector %s (%s) with classad:\n", my_collector.addr(), my_collector.fullHostname() ); dPrintAd( D_HOSTNAME, queryAd ); dprintf( D_HOSTNAME, " --- End of Query ClassAd ---\n" ); } int mytimeout = param_integer ("QUERY_TIMEOUT",60); if (!(sock = my_collector.startCommand(command, Stream::reli_sock, mytimeout, errstack)) || !putClassAd (sock, queryAd) || !sock->end_of_message()) { if (sock) { delete sock; } return Q_COMMUNICATION_ERROR; } // get result sock->decode (); more = 1; while (more) { if (!sock->code (more)) { sock->end_of_message(); delete sock; return Q_COMMUNICATION_ERROR; } if (more) { ad = new ClassAd; if( !getClassAd(sock, *ad) ) { sock->end_of_message(); delete ad; delete sock; return Q_COMMUNICATION_ERROR; } adList.Insert (ad); } } sock->end_of_message(); // finalize sock->close(); delete sock; return (Q_OK); }