Example no. 1
0
void
VMUniverseMgr::printVMGahpInfo( int debug_level )
{
	dprintf( debug_level, "........VMGAHP info........\n");
	dPrintAd(debug_level, m_vmgahp_info);
	dprintf( debug_level, "\n");
}
void
JobInfoCommunicator::checkForStarterDebugging( void )
{
	if( ! job_ad ) {
		EXCEPT( "checkForStarterDebugging() called with no job ad!" );
	}

		// For debugging, see if there's a special attribute in the
		// job ad that sends us into an infinite loop, waiting for
		// someone to attach with a debugger
	volatile int starter_should_wait = 0;
	int tmp = 0; // Can't pass volatile int into LookupInteger
	job_ad->LookupInteger( ATTR_STARTER_WAIT_FOR_DEBUG, tmp );
	starter_should_wait = tmp;
	if( starter_should_wait ) {
		dprintf( D_ALWAYS, "Job requested starter should wait for "
				 "debugger with %s=%d, going into infinite loop\n",
				 ATTR_STARTER_WAIT_FOR_DEBUG, starter_should_wait );
		while( 1 ) {
			if ( !starter_should_wait ) {
				break;
			}
		}
	}

		// Also, if the starter has D_JOB turned on, we want to dump
		// out the job ad to the log file...
	if( IsDebugLevel( D_JOB ) ) {
		dprintf( D_JOB, "*** Job ClassAd ***\n" );
		dPrintAd( D_JOB, *job_ad );
		dprintf( D_JOB, "--- End of ClassAd ---\n" );
	}
}
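
The wait loop above relies on the flag being volatile so that a developer can attach a debugger to the running starter and clear it by hand. A minimal stand-alone sketch of the same pattern (illustrative only, not HTCondor code):

#include <cstdio>

int main()
{
	// Cleared from an attached debugger, e.g. in gdb:
	//   (gdb) set var should_wait = 0
	volatile int should_wait = 1;
	while( should_wait ) { }    // spin until the flag is cleared externally
	printf( "debugger released us, continuing\n" );
	return 0;
}
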
Example no. 3
0
/* Continue reading from stdin the rest of the protocol for this encapsulation
	method */
int
TransferD::accept_transfer_request_encapsulation_old_classads(FILE *fin)
{
	int i;
	int eof, error, empty;
	const char *classad_delimitor = "---\n";
	ClassAd *ad;
	TransferRequest *treq = NULL;
	MyString cap;

	/* read the transfer request header packet upon construction */
	ad = new ClassAd(fin, classad_delimitor, eof, error, empty);
	if (empty == TRUE) {
		EXCEPT("Protocol faliure, can't read initial Info Packet");
	}

	// initialize the header information of the TransferRequest object.
	treq = new TransferRequest(ad);
	if (treq == NULL) {
		EXCEPT("Out of memory!");
	}

	treq->dprintf(D_ALWAYS);
	
	/* read the information packet which describes the rest of the protocol */
	if (treq->get_num_transfers() <= 0) {
		EXCEPT("Protocol error!");
	}

	// read all the work ads associated with this TransferRequest
	for (i = 0; i < treq->get_num_transfers(); i++) {
		ad = new ClassAd(fin, classad_delimitor, eof, error, empty);
		if (empty == TRUE) {
			EXCEPT("Expected %d transfer job ads, got %d instead.", 
				treq->get_num_transfers(), i);
		}
		dPrintAd(D_ALWAYS, *ad);
		treq->append_task(ad);
	}

	// Since stdin may only provide one transfer request currently, make up
	// a capability and shove it into the work hash
	cap = gen_capability();

	// record that I've accepted it.
	m_treqs.insert(cap, treq);

	// note that we no longer need the inactivity timer
	m_inactivity_timer = 0;

	return TRUE;
}
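
For reference, a rough sketch of a producer for the stream this routine parses: a header ClassAd announcing the transfer count, followed by one work ad per transfer, each terminated by the "---" delimiter line. The attribute names used here are assumptions for illustration, not taken from the TransferRequest sources:

// Illustrative only; attribute names are hypothetical.
static void write_example_transfer_request( FILE *fout, int num_transfers )
{
	fprintf( fout, "NumTransfers = %d\n", num_transfers );
	fprintf( fout, "---\n" );                     // end of the header ad
	for ( int i = 0; i < num_transfers; i++ ) {
		fprintf( fout, "Cmd = \"/bin/job_%d\"\n", i );
		fprintf( fout, "---\n" );                 // end of this work ad
	}
}
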
Example no. 4
0
void
Shadow::printInfo( int debug_level )
{
	dprintf( debug_level, "Info for \"%s\":\n", s_path );
	dprintf( debug_level | D_NOHEADER, "IsDaemonCore: %s\n", 
			 s_is_dc ? "True" : "False" );
	if( ! s_ad ) {
		dprintf( debug_level | D_NOHEADER, 
				 "No ClassAd available!\n" ); 
	} else {
		dPrintAd( debug_level, *s_ad );
	}
	dprintf( debug_level | D_NOHEADER, "*** End of shadow info ***\n" ); 
}
Example no. 5
0
void main_init(int argc, char *argv[])
{
	char *testfile = NULL;
	ClassAd *inputAd = NULL;
	int i;

	dprintf(D_ALWAYS, "main_init() called\n");

	for (i=1; i<argc; i++ ) {
	
		if (match_prefix(argv[i],"-withfile")) {
			i++;
			if (argc <= i) {
				fprintf(stderr,
						"ERROR: Argument -withfile requires a parameter\n ");
				exit(1);
			}
			testfile = argv[i];
		}
	
	}	// end of parsing command line options

	if ( testfile ) {
		FILE* fp = safe_fopen_wrapper(testfile,"r");
		if (!fp) {
			fprintf(stderr,"ERROR: Unable to open test file %s\n",
					testfile);
			DC_Exit(1);
		}
		int EndFlag=0, ErrorFlag=0, EmptyFlag=0;
		if( !( inputAd = new ClassAd(fp, "***", EndFlag, ErrorFlag, EmptyFlag) ) ) {
			fprintf( stderr, "ERROR:  Out of memory\n" );
			DC_Exit( 1 );
		}
		fclose(fp);
		if ( ErrorFlag || EmptyFlag ) {
			fprintf( stderr, "ERROR - file %s does not contain a parseable ClassAd\n",
					 testfile);
			DC_Exit(1);
		}
		// since this option is for testing, process then exit
		ClassAd * resultAd =  process_request(inputAd);
		dPrintAd(D_ALWAYS, *resultAd);
		DC_Exit( 0 );
	}
}
Example no. 6
0
void
dumpClassad( const char* header, ClassAd* ad, int debug_flag )
{
	if( ! header  ) {
		dprintf( D_ALWAYS, "ERROR: called dumpClassad() w/ NULL header\n" ); 
		return;
	}
	if( ! ad  ) {
		dprintf( D_ALWAYS, "ERROR: called dumpClassad(\"%s\") w/ NULL ad\n", 
				 header );   
		return;
	}
	if( IsDebugCatAndVerbosity(debug_flag) ) {
		dprintf( debug_flag, "*** ClassAd Dump: %s ***\n", header );  
		dPrintAd( debug_flag, *ad );
		dprintf( debug_flag, "--- End of ClassAd ---\n" );
	}
}
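
A typical call site for this helper might look like the following (illustrative; the ad variable is hypothetical):

// Dump a machine ad at D_FULLDEBUG; nothing is printed unless that
// debug category/verbosity is enabled.
//
//   dumpClassad( "slot ClassAd before activation", slot_ad, D_FULLDEBUG );
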
Example no. 7
0
int
Starter::receiveJobClassAdUpdate( Stream *stream )
{
	ClassAd update_ad;
	int final_update = 0;

		// It is expected that we will get here when the stream is closed.
		// Unfortunately, log noise will be generated when we try to read
		// from it.

	stream->decode();
	stream->timeout(10);
	if( !stream->get( final_update) ||
		!getClassAd( stream, update_ad ) ||
		!stream->end_of_message() )
	{
		final_update = 1;
	}
	else {
		dprintf(D_FULLDEBUG, "Received job ClassAd update from starter.\n");
		dPrintAd( D_JOB, update_ad );

		// In addition to new info about the job, the starter also
		// inserts contact info for itself (important for CCB and
		// shadow-starter reconnect, because startd needs to relay
		// starter's full contact info to the shadow when queried).
		// It's a bit of a hack to do it through this channel, but
		// better than nothing.
		update_ad.LookupString(ATTR_STARTER_IP_ADDR,m_starter_addr);

		if( s_claim ) {
			s_claim->receiveJobClassAdUpdate( update_ad );
		}
	}

	if( final_update ) {
		dprintf(D_FULLDEBUG, "Closing job ClassAd update socket from starter.\n");
		daemonCore->Cancel_Socket(s_job_update_sock);
		delete s_job_update_sock;
		s_job_update_sock = NULL;
	}
	return KEEP_STREAM;
}
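
The sending side of this update protocol mirrors the decode sequence above: an integer final-update flag, the job ClassAd, then end-of-message. A hedged sketch (not taken from the starter sources):

// Illustrative counterpart to the receive logic above.
static bool sendJobClassAdUpdate( Stream *stream, ClassAd &ad, bool is_final )
{
	stream->encode();
	int final_update = is_final ? 1 : 0;
	if( !stream->put( final_update ) ||
		!putClassAd( stream, ad ) ||
		!stream->end_of_message() )
	{
		dprintf( D_ALWAYS, "Failed to send job ClassAd update\n" );
		return false;
	}
	return true;
}
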
Example no. 8
0
void
doContactSchedd()
{
	int rc;
	Qmgr_connection *schedd;
	BaseJob *curr_job;
	ClassAd *next_ad;
	char expr_buf[12000];
	bool schedd_updates_complete = false;
	bool schedd_deletes_complete = false;
	bool add_remove_jobs_complete = false;
	bool update_jobs_complete = false;
	bool commit_transaction = true;
	int failure_line_num = 0;
	bool send_reschedule = false;
	std::string error_str = "";
	StringList dirty_job_ids;
	char *job_id_str;
	PROC_ID job_id;
	CondorError errstack;

	dprintf(D_FULLDEBUG,"in doContactSchedd()\n");

	initJobExprs();

	contactScheddTid = TIMER_UNSET;

	// vacateJobs
	/////////////////////////////////////////////////////
	if ( pendingScheddVacates.getNumElements() != 0 ) {
		std::string buff;
		StringList job_ids;
		VacateRequest curr_request;

		int result;
		ClassAd* rval;

		pendingScheddVacates.startIterations();
		while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
			formatstr( buff, "%d.%d", curr_request.job->procID.cluster,
						  curr_request.job->procID.proc );
			job_ids.append( buff.c_str() );
		}

		char *tmp = job_ids.print_to_string();
		if ( tmp ) {
			dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp );
			free(tmp);
			tmp = NULL;
		}

		rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack );
		if ( rval == NULL ) {
			formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!",
							   errstack.getFullText().c_str() );
			goto contact_schedd_failure;
		} else {
			pendingScheddVacates.startIterations();
			while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
				formatstr( buff, "job_%d_%d", curr_request.job->procID.cluster,
							  curr_request.job->procID.proc );
				if ( !rval->LookupInteger( buff.c_str(), result ) ) {
					dprintf( D_FULLDEBUG, "vacateJobs returned malformed ad\n" );
					EXCEPT( "vacateJobs returned malformed ad" );
				} else {
					dprintf( D_FULLDEBUG, "   %d.%d vacate result: %d\n",
							 curr_request.job->procID.cluster,
							 curr_request.job->procID.proc,result);
					pendingScheddVacates.remove( curr_request.job->procID );
					curr_request.result = (action_result_t)result;
					curr_request.job->SetEvaluateState();
					completedScheddVacates.insert( curr_request.job->procID,
												   curr_request );
				}
			}
			delete rval;
		}
	}


	schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName, CondorVersion() );
	if ( !schedd ) {
		error_str = "Failed to connect to schedd!";
		goto contact_schedd_failure;
	}


	// CheckLeases
	/////////////////////////////////////////////////////
	if ( checkLeasesSignaled ) {

		dprintf( D_FULLDEBUG, "querying for renewed leases\n" );

		// Grab the lease attributes of all the jobs in our global hashtable.

		BaseJob::JobsByProcId.startIterations();

		while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) {
			int new_expiration;

			rc = GetAttributeInt( curr_job->procID.cluster,
								  curr_job->procID.proc,
								  ATTR_TIMER_REMOVE_CHECK,
								  &new_expiration );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// This job doesn't have a lease from
						// the submitter. Skip it.
					continue;
				}
			}
			curr_job->UpdateJobLeaseReceived( new_expiration );
		}

		checkLeasesSignaled = false;
	}	// end of handling check leases


	// AddJobs
	/////////////////////////////////////////////////////
	if ( addJobsSignaled || firstScheddContact ) {
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for new jobs\n" );

		// Make sure we grab all Globus Universe jobs (except held ones
		// that we previously indicated we were done with)
		// when we first start up in case we're recovering from a
		// shutdown/meltdown.
		// Otherwise, grab all jobs that are unheld and aren't marked as
		// currently being managed and aren't marked as not matched.
		// If JobManaged is undefined, equate it with false.
		// If Matched is undefined, equate it with true.
		// NOTE: Schedds from Condor 6.6 and earlier don't include
		//   "(Universe==9)" in the constraint they give to the gridmanager,
		//   so this gridmanager will pull down non-globus-universe ads,
		//   although it won't use them. This is inefficient but not
		//   incorrect behavior.
		if ( firstScheddContact ) {
			// Grab all jobs for us to manage. This expression is a
			// derivative of the expression below for new jobs. We add
			// "|| Managed =?= TRUE" to also get jobs our previous
			// incarnation was in the middle of managing when it died
			// (if it died unexpectedly). With the new term, the
			// "&& Managed =!= TRUE" from the new jobs expression becomes
			// superfluous (by boolean logic), so we drop it.
			sprintf( expr_buf,
					 "%s && %s && ((%s && %s) || %s)",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_managed.c_str()
					 );
		} else {
			// Grab new jobs for us to manage
			sprintf( expr_buf,
					 "%s && %s && %s && %s && %s",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_not_managed.c_str()
					 );
		}
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *old_job;
			int job_is_matched = 1;		// default to true if not in ClassAd

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			bool job_is_managed = jobExternallyManaged(next_ad);
			next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched);

			if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) {

				JobType *job_type = NULL;
				BaseJob *new_job = NULL;

				// job had better be either managed or matched! (or both)
				ASSERT( job_is_managed || job_is_matched );

				if ( MustExpandJobAd( next_ad ) ) {
					// Get the expanded ClassAd from the schedd, which
					// has the GridResource filled in with info from
					// the matched ad.
					delete next_ad;
					next_ad = NULL;
					next_ad = GetJobAd(procID.cluster,procID.proc);
					if ( next_ad == NULL && errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
					if ( next_ad == NULL ) {
						// We may get here if it was not possible to expand
						// one of the $$() expressions.  We don't want to
						// roll back the transaction and blow away the
						// hold that the schedd just put on the job, so
						// simply skip over this ad.
						dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d.  errno=%d\n",procID.cluster,procID.proc,errno);
						goto contact_schedd_next_add_job;
					}
				}

				// Search our job types for one that'll handle this job
				jobTypes.Rewind();
				while ( jobTypes.Next( job_type ) ) {
					if ( job_type->AdMatchFunc( next_ad ) ) {

						// Found one!
						dprintf( D_FULLDEBUG, "Using job type %s for job %d.%d\n",
								 job_type->Name, procID.cluster, procID.proc );
						break;
					}
				}

				if ( job_type != NULL ) {
					new_job = job_type->CreateFunc( next_ad );
				} else {
					dprintf( D_ALWAYS, "No handlers for job %d.%d\n",
							 procID.cluster, procID.proc );
					new_job = new BaseJob( next_ad );
				}

				ASSERT(new_job);
				new_job->SetEvaluateState();
				dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n",
						new_job->procID.cluster,new_job->procID.proc);
				num_ads++;

				if ( !job_is_managed ) {
					rc = tSetAttributeString( new_job->procID.cluster,
									   new_job->procID.proc,
									   ATTR_JOB_MANAGED,
									   MANAGED_EXTERNAL);
					if ( rc < 0 ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
				}

			} else {

				// We already know about this job, skip
				// But also set Managed=true on the schedd so that it won't
				// keep signalling us about it
				delete next_ad;
				rc = tSetAttributeString( procID.cluster, procID.proc,
								   ATTR_JOB_MANAGED, MANAGED_EXTERNAL );
				if ( rc < 0 ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				}

			}

contact_schedd_next_add_job:
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}	// end of while next_ad
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads);
	}	// end of handling add jobs


	// RemoveJobs
	/////////////////////////////////////////////////////

	// We always want to perform this check. Otherwise, we may overwrite a
	// REMOVED/HELD/COMPLETED status with something else below.
	{
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" );

		// Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we
		// haven't previously indicated that we're done with (by setting
		// JobManaged to "Schedd").
		sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))",
				 ScheddJobConstraint, expr_not_completely_done.c_str(),
				 ATTR_JOB_STATUS, REMOVED,
				 ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD,
				 ATTR_JOB_MANAGED, MANAGED_EXTERNAL );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *next_job;
			int curr_status;

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status );

			if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) {
				// Should probably skip jobs we already have marked as
				// held or removed

				next_job->JobAdUpdateFromSchedd( next_ad, true );
				num_ads++;

			} else if ( curr_status == REMOVED ) {

				// If we don't know about the job, act like we got an
				// ADD_JOBS signal from the schedd the next time we
				// connect, so that we'll create a Job object for it
				// and decide how it needs to be handled.
				// TODO The AddJobs and RemoveJobs queries should be
				//   combined into a single query.
				dprintf( D_ALWAYS, 
						 "Don't know about removed job %d.%d. "
						 "Will treat it as a new job to manage\n",
						 procID.cluster, procID.proc );
				addJobsSignaled = true;

			} else {

				dprintf( D_ALWAYS, "Don't know about held/completed job %d.%d. "
						 "Ignoring it\n",
						 procID.cluster, procID.proc );

			}

			delete next_ad;
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads);
	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	add_remove_jobs_complete = true;


	// Retrieve dirty attributes
	/////////////////////////////////////////////////////
	if ( updateJobsSignaled ) {
		dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" );

		sprintf( expr_buf, "%s && %s && %s && %s",
				 expr_schedd_job_constraint.c_str(), 
				 expr_not_completely_done.c_str(),
				 expr_not_held.c_str(),
				 expr_managed.c_str()
				 );
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			ClassAd updates;
			char str[PROC_ID_STR_BUFLEN];
			next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc );
			if ( GetDirtyAttributes( job_id.cluster, job_id.proc, &updates ) < 0 ) {
				dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc );
				failure_line_num = __LINE__;
				delete next_ad;
				goto contact_schedd_disconnect;
			} else {
				dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc);
				dPrintAd(D_JOB, updates);
			}
			if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
				curr_job->JobAdUpdateFromSchedd( &updates, false );
				ProcIdToStr( job_id, str );
				dirty_job_ids.append( str );
			}
			else {
				dprintf( D_ALWAYS, "Don't know about updated job %d.%d. "
						 "Ignoring it\n",
						 job_id.cluster, job_id.proc );
			}
			delete next_ad;
			next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 );
		}
	}
	update_jobs_complete = true;

//	if ( BeginTransaction() < 0 ) {
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}


	// requestJobStatus
	/////////////////////////////////////////////////////
	if ( pendingJobStatus.getNumElements() != 0 ) {
		JobStatusRequest curr_request;

		pendingJobStatus.startIterations();
		while ( pendingJobStatus.iterate( curr_request ) != 0 ) {

			int status;

			rc = GetAttributeInt( curr_request.job_id.cluster,
								  curr_request.job_id.proc,
								  ATTR_JOB_STATUS, &status );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so return a job status of REMOVED.
					status = REMOVED;
				}
			}
				// return status
			dprintf( D_FULLDEBUG, "%d.%d job status: %d\n",
					 curr_request.job_id.cluster,
					 curr_request.job_id.proc, status );
			pendingJobStatus.remove( curr_request.job_id );
			curr_request.job_status = status;
			daemonCore->Reset_Timer( curr_request.tid, 0 );
			completedJobStatus.insert( curr_request.job_id,
									   curr_request );
		}

	}


	// Update existing jobs
	/////////////////////////////////////////////////////
	ScheddUpdateRequest *curr_request;
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n",
				curr_job->procID.cluster, curr_job->procID.proc);
		const char *attr_name;
		const char *attr_value;
		ExprTree *expr;
		bool fake_job_in_queue = false;
		curr_job->jobAd->ResetExpr();
		while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true &&
				fake_job_in_queue == false ) {
			attr_value = ExprTreeToString( expr );

			dprintf(D_FULLDEBUG,"   %s = %s\n",attr_name,attr_value);
			rc = SetAttribute( curr_job->procID.cluster,
							   curr_job->procID.proc,
							   attr_name,
							   attr_value);
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so pretend that all updates for the job succeed.
						// Otherwise, we'll never make forward progress on
						// the job.
						// TODO We should also fake a job status of REMOVED
						//   to the job, so it can do what cleanup it can.
					fake_job_in_queue = true;
					break;
				}
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_updates_complete = true;


	// Delete existing jobs
	/////////////////////////////////////////////////////
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		if ( curr_job->deleteFromSchedd ) {
			dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n",
					curr_job->procID.cluster, curr_job->procID.proc);
			rc = DestroyProc(curr_job->procID.cluster,
							 curr_job->procID.proc);
				// NOENT means the job doesn't exist.  Good enough for us.
			if ( rc < 0 && rc != DESTROYPROC_ENOENT) {
				failure_line_num = __LINE__;
				commit_transaction = false;
				goto contact_schedd_disconnect;
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_deletes_complete = true;


 contact_schedd_disconnect:
	DisconnectQ( schedd, commit_transaction );

	if ( add_remove_jobs_complete == true ) {
		firstScheddContact = false;
		addJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( update_jobs_complete == true ) {
		updateJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( schedd_updates_complete == false ) {
		formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	// Clear dirty bits for all jobs updated
	if ( !dirty_job_ids.isEmpty() ) {
		ClassAd *rval;
		dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n",
				 dirty_job_ids.number() );
		dirty_job_ids.rewind();
		rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack );
		if ( rval == NULL ) {
			dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes.  CondorError: %s\n", errstack.getFullText().c_str() );
		}
		delete rval;
	}

	// Wake up jobs that had schedd updates pending and delete job
	// objects that wanted to be deleted
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		curr_job->jobAd->ClearAllDirtyFlags();

		if ( curr_job->deleteFromGridmanager ) {

				// If the Job object wants to delete the job from the
				// schedd but we failed to do so, don't delete the job
				// object yet; wait until we successfully delete the job
				// from the schedd.
			if ( curr_job->deleteFromSchedd == true &&
				 schedd_deletes_complete == false ) {
				continue;
			}

				// If wantRematch is set, send a reschedule now
			if ( curr_job->wantRematch ) {
				send_reschedule = true;
			}
			pendingScheddUpdates.remove( curr_job->procID );
			pendingScheddVacates.remove( curr_job->procID );
			pendingJobStatus.remove( curr_job->procID );
			completedJobStatus.remove( curr_job->procID );
			completedScheddVacates.remove( curr_job->procID );
			delete curr_job;

		} else {
			pendingScheddUpdates.remove( curr_job->procID );

			if ( curr_request->m_notify ) {
				curr_job->SetEvaluateState();
			}
		}

		delete curr_request;
	}

	// Poke objects that wanted to be notified when a schedd update completed
	// successfully (possibly minus deletes)
	int timer_id;
	scheddUpdateNotifications.Rewind();
	while ( scheddUpdateNotifications.Next( timer_id ) ) {
		daemonCore->Reset_Timer( timer_id, 0 );
	}
	scheddUpdateNotifications.Clear();

	if ( send_reschedule == true ) {
		ScheddObj->reschedule();
	}

	// Check if we have any jobs left to manage. If not, exit.
	if ( BaseJob::JobsByProcId.getNumElements() == 0 ) {
		dprintf( D_ALWAYS, "No jobs left, shutting down\n" );
		daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM );
	}

	lastContactSchedd = time(NULL);

	if ( schedd_deletes_complete == false ) {
		error_str = "Problem using DestroyProc to delete jobs!";
		goto contact_schedd_failure;
	}

	scheddFailureCount = 0;

	// For each job that had dirty attributes, re-evaluate the policy
	dirty_job_ids.rewind();
	while ( (job_id_str = dirty_job_ids.next()) != NULL ) {
		StrToProcIdFixMe(job_id_str, job_id);
		if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
			curr_job->EvalPeriodicJobExpr();
		}
	}

dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n");
	return;

 contact_schedd_failure:
	scheddFailureCount++;
	if ( error_str == "" ) {
		error_str = "Failure in doContactSchedd";
	}
	if ( scheddFailureCount >= maxScheddFailures ) {
		dprintf( D_ALWAYS, "%s\n", error_str.c_str() );
		EXCEPT( "Too many failures connecting to schedd!" );
	}
	dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() );
	lastContactSchedd = time(NULL);
	RequestContactSchedd();
	return;
}
Example no. 9
0
bool
DCStarter::peek(bool transfer_stdout, ssize_t &stdout_offset, bool transfer_stderr, ssize_t &stderr_offset, const std::vector<std::string> &filenames, std::vector<ssize_t> &offsets, size_t max_bytes, bool &retry_sensible, PeekGetFD &next, std::string &error_msg, unsigned timeout, const std::string &sec_session_id, DCTransferQueue *xfer_q)
{
	compat_classad::ClassAd ad;
	ad.InsertAttr(ATTR_JOB_OUTPUT, transfer_stdout);
	ad.InsertAttr("OutOffset", stdout_offset);
	ad.InsertAttr(ATTR_JOB_ERROR, transfer_stderr);
	ad.InsertAttr("ErrOffset", stderr_offset);
	ad.InsertAttr(ATTR_VERSION, CondorVersion());

	size_t total_files = 0;
	total_files += transfer_stdout ? 1 : 0;
	total_files += transfer_stderr ? 1 : 0;

	if (filenames.size())
	{
		total_files += filenames.size();
		std::vector<classad::ExprTree *> filelist; filelist.reserve(filenames.size());
		std::vector<classad::ExprTree *> offsetlist; offsetlist.reserve(filenames.size());
		std::vector<ssize_t>::const_iterator it2 = offsets.begin();
		for (std::vector<std::string>::const_iterator it = filenames.begin();
			it != filenames.end() && it2 != offsets.end();
			it++, it2++)
		{
			classad::Value value;
			value.SetStringValue(*it);
			filelist.push_back(classad::Literal::MakeLiteral(value));
			value.SetIntegerValue(*it2);
			offsetlist.push_back(classad::Literal::MakeLiteral(value));
		}
		classad::ExprTree *list(classad::ExprList::MakeExprList(filelist));
		ad.Insert("TransferFiles", list);
		list = classad::ExprList::MakeExprList(offsetlist);
		ad.Insert("TransferOffsets", list);
	}

	ad.InsertAttr(ATTR_MAX_TRANSFER_BYTES, static_cast<long long>(max_bytes));

	ReliSock sock;

	if( !connectSock(&sock, timeout, NULL) ) {
		error_msg = "Failed to connect to starter";
		return false;
	}

	if( !startCommand(STARTER_PEEK, &sock, timeout, NULL, NULL, false, sec_session_id.c_str()) ) {
		error_msg = "Failed to send START_PEEK to starter";
		return false;
	}
	sock.encode();
	if (!putClassAd(&sock, ad) || !sock.end_of_message()) {
		error_msg = "Failed to send request to starter";
		return false;
	}

	compat_classad::ClassAd response;
	sock.decode();
	if (!getClassAd(&sock, response) || !sock.end_of_message())
	{
		error_msg = "Failed to read response for peeking at logs.";
		return false;
	}
	dPrintAd(D_FULLDEBUG, response);

	bool success = false;
	if (!response.EvaluateAttrBool(ATTR_RESULT, success) || !success)
	{
		response.EvaluateAttrBool(ATTR_RETRY, retry_sensible);
		error_msg = "Remote operation failed.";
		response.EvaluateAttrString(ATTR_ERROR_STRING, error_msg);
		return false;
	}
	classad::Value valueX;
	classad_shared_ptr<classad::ExprList> list;
	if (!response.EvaluateAttr("TransferFiles", valueX) || !valueX.IsSListValue(list))
	{
		error_msg = "Unable to evaluate starter response";
		return false;
	}
	classad_shared_ptr<classad::ExprList> offlist;
	if (!response.EvaluateAttr("TransferOffsets", valueX) || !valueX.IsSListValue(offlist))
	{
		error_msg = "Unable to evaluate starter response (missing offsets)";
		return false;
	}

	size_t remaining = max_bytes;
	size_t file_count = 0;
	classad::ExprList::const_iterator it2 = offlist->begin();
	for (classad::ExprList::const_iterator it = list->begin();
		it != list->end() && it2 != offlist->end();
		it++, it2++)
	{
		classad::Value value;
		(*it2)->Evaluate(value);
		off_t off = -1;
		value.IsIntegerValue(off);
		(*it)->Evaluate(value);
		std::string filename;
		int64_t xfer_fd = -1;
		if (!value.IsStringValue(filename) && value.IsIntegerValue(xfer_fd))
		{
			if (xfer_fd == 0) filename = "_condor_stdout";
			if (xfer_fd == 1) filename = "_condor_stderr";
		}
		int fd = next.getNextFD(filename);
		filesize_t size = -1;
		int retval;
		if ((retval = sock.get_file(&size, fd, false, false, remaining, xfer_q)) && (retval != GET_FILE_MAX_BYTES_EXCEEDED))
		{
			error_msg = "Internal error when transferring file " + filename;
		}
		else if (size >= 0)
		{
			remaining -= size;    // deduct the bytes actually transferred
			file_count++;
			off += size;
		}
		else
		{
			error_msg = "Failed to transfer file " + filename;
		}
		if (xfer_fd == 0)
		{
			stdout_offset = off;
			//dprintf(D_FULLDEBUG, "New stdout offset: %ld\n", stdout_offset);
		}
		else if (xfer_fd == 1)
		{
			stderr_offset = off;
		}
		else
		{
			std::vector<ssize_t>::iterator it4 = offsets.begin();
			for (std::vector<std::string>::const_iterator it3 = filenames.begin();
				it3 != filenames.end() && it4 != offsets.end();
				it3++, it4++)
			{
				if (*it3 == filename) *it4 = off;
			}
		}
	}
	size_t remote_file_count;
	if (!sock.get(remote_file_count) || !sock.end_of_message())
	{
		error_msg = "Unable to get remote file count.";
		return false;
	}
	if (file_count != remote_file_count)
	{
		std::stringstream ss;
		ss << "Recieved " << file_count << " files, but remote side thought it sent " << remote_file_count << " files";
		error_msg = ss.str();
		return false;
	}
	if ((total_files != file_count) && !error_msg.size())
	{
		error_msg = "At least one file transfer failed.";
		return false;
	}
	return true;
}
Example no. 10
0
void
XferSummary::time_out(time_t now, char *hostaddr)
{
	ClassAd	   	info;
	char		line[128], *tmp;
	char		*str = NULL;

	SetMyTypeName(info, "CkptServer");
	SetTargetTypeName(info, "CkptFile");

	sprintf(line, "%s = \"%s\"", ATTR_NAME, get_local_fqdn().Value());
	info.Insert(line);
    sprintf(line, "%s = \"%s\"", ATTR_MACHINE, hostaddr );
	info.Insert(line);
	sprintf(line, "%s = \"%s\"", ATTR_VERSION, CondorVersion() );
	info.Insert(line);
	sprintf(line, "%s = \"%s\"", ATTR_PLATFORM, CondorPlatform() );
	info.Insert(line);
	sprintf(line, "NumSends = %d", num_sends);
	info.Insert(line);
	sprintf(line, "BytesSent = %d", (int) bytes_sent);
	info.Insert(line);
	sprintf(line, "TimeSending = %d", time_sending);
	info.Insert(line);
	sprintf(line, "AvgSendBandwidth = %f", num_sends ?
			tot_send_bandwidth / num_sends : 0.0);
	info.Insert(line);
	sprintf(line, "NumRecvs = %d", num_recvs);
	info.Insert(line);
	sprintf(line, "BytesReceived = %d", (int) bytes_recv);
	info.Insert(line);
	sprintf(line, "TimeReceiving = %d", time_recving);
	info.Insert(line);
	sprintf(line, "AvgReceiveBandwidth = %f", num_recvs ?
			tot_recv_bandwidth / num_recvs : 0.0);
	info.Insert(line);

	/* ctime adds a newline at the end of the ascii conversion.... */
	str = ctime(&start_time);
	sprintf(line, "CkptServerIntervalStart = \"%s\"", str ? str : "Unknown\n");
	tmp = strchr( line, '\n' );
	if (tmp != NULL) {
		/* delete the newline */
		*tmp = '\"';
		tmp++;
		*tmp = '\0';
	}
	info.Insert(line);

	/* ctime adds a newline at the end of the ascii conversion.... */
	str = ctime(&now);
	sprintf(line, "CkptServerIntervalEnd = \"%s\"", str ? str : "Unknown\n");
	tmp = strchr( line, '\n' );
	if (tmp != NULL) {
		/* delete the newline */
		*tmp = '\"';
		tmp++;
		*tmp = '\0';
	}
	info.Insert(line);

	info.Assign("Disk", sysapi_disk_space(pwd.Value()));
	
	// Send to collector
	if ( Collectors ) {
        dprintf(D_NETWORK, "Sending CkptServer ClassAd:\n");
        dPrintAd(D_NETWORK, info);
		Collectors->sendUpdates (UPDATE_CKPT_SRVR_AD, &info, NULL, true);
	}

	init();
}
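
The sprintf/Insert pairs above build each attribute as formatted text; the same ad could be filled in with typed Assign() calls, which avoids the fixed-size line buffer. A sketch of the equivalent calls (illustrative, not the original code):

	info.Assign( ATTR_NAME, get_local_fqdn().Value() );
	info.Assign( ATTR_MACHINE, hostaddr );
	info.Assign( ATTR_VERSION, CondorVersion() );
	info.Assign( "NumSends", num_sends );
	info.Assign( "BytesSent", (int)bytes_sent );
	info.Assign( "AvgSendBandwidth",
	             num_sends ? tot_send_bandwidth / num_sends : 0.0 );
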
Example no. 11
0
ClassAd* 
readJobAd( void )
{
	ClassAd* ad = NULL;
	bool is_stdin = false;
	bool read_something = false;

	ASSERT( job_ad_file );

	if( job_ad_file[0] == '-' && job_ad_file[1] == '\0' ) {
		fp = stdin;
		is_stdin = true;
	} else {
		if (fp == NULL) {
			fp = safe_fopen_wrapper_follow( job_ad_file, "r" );
			if( ! fp ) {
				EXCEPT( "Failed to open ClassAd file (%s): %s (errno %d)",
						job_ad_file, strerror(errno), errno );
			}
		}
	}

	dprintf( D_FULLDEBUG, "Reading job ClassAd from %s\n",
			 is_stdin ? "STDIN" : job_ad_file );

	ad = new ClassAd;
	MyString line;
	while( line.readLine(fp) ) {
		read_something = true;
		line.chomp();
		if( line[0] == '#' ) {
			dprintf( D_JOB, "IGNORING COMMENT: %s\n", line.Value() );
			continue;
		}
		if( line == "***" ) {
			dprintf( D_JOB, "Saw ClassAd delimiter, stopping\n" );
			break;
		}
		if( ! ad->Insert(line.Value()) ) {
			EXCEPT( "Failed to insert \"%s\" into ClassAd!", line.Value() );
		}
	}
	if( ! read_something ) {
		EXCEPT( "reading ClassAd from (%s): file is empty",
				is_stdin ? "STDIN" : job_ad_file );
	}
	if( IsDebugVerbose(D_JOB) ) {
		dPrintAd( D_JOB, *ad );
	} 

	// For debugging, see if there's a special attribute in the
	// job ad that sends us into an infinite loop, waiting for
	// someone to attach with a debugger
	int shadow_should_wait = 0;
	ad->LookupInteger( ATTR_SHADOW_WAIT_FOR_DEBUG,
					   shadow_should_wait );
	if( shadow_should_wait ) {
		dprintf( D_ALWAYS, "Job requested shadow should wait for "
			"debugger with %s=%d, going into infinite loop\n",
			ATTR_SHADOW_WAIT_FOR_DEBUG, shadow_should_wait );
		while( shadow_should_wait ) { }
	}

	return ad;
}
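
For reference, the file this routine accepts is a plain attribute-per-line ClassAd; '#' lines are skipped and an optional "***" line ends the ad. A sketch of such a file (values are hypothetical):

/* Sketch of an input file this reader accepts (values are hypothetical):

       # job ad written out for testing
       Cmd = "/bin/sleep"
       Arguments = "60"
       Iwd = "/tmp"
       ***
*/
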
Example no. 12
0
bool CollectorEngine::ValidateClassAd(int command,ClassAd *clientAd,Sock *sock)
{

	if( !m_collector_requirements ) {
			// no need to do any of the following checks if the admin has
			// not configured any COLLECTOR_REQUIREMENTS
		return true;
	}


	char const *ipattr = NULL;
	switch( command ) {
	  case MERGE_STARTD_AD:
	  case UPDATE_STARTD_AD:
	  case UPDATE_STARTD_AD_WITH_ACK:
		  ipattr = ATTR_STARTD_IP_ADDR;
		  break;
	  case UPDATE_SCHEDD_AD:
	  case UPDATE_SUBMITTOR_AD:
		  ipattr = ATTR_SCHEDD_IP_ADDR;
		  break;
	  case UPDATE_MASTER_AD:
		  ipattr = ATTR_MASTER_IP_ADDR;
		  break;
	  case UPDATE_NEGOTIATOR_AD:
		  ipattr = ATTR_NEGOTIATOR_IP_ADDR;
		  break;
	  case UPDATE_COLLECTOR_AD:
		  ipattr = ATTR_COLLECTOR_IP_ADDR;
		  break;
	  case UPDATE_LICENSE_AD:
	  case UPDATE_CKPT_SRVR_AD:
	  case UPDATE_STORAGE_AD:
	  case UPDATE_HAD_AD:
	  case UPDATE_AD_GENERIC:
      case UPDATE_GRID_AD:
	  case UPDATE_ACCOUNTING_AD:
	  default:
		  break;
	}

	if(ipattr) {
		MyString my_address;
		MyString subsys_ipaddr;

			// Some ClassAds contain two copies of the IP address,
			// one named "MyAddress" and one named "<SUBSYS>IpAddr".
			// If the latter exists, then it _must_ match the former,
			// because people may be filtering in COLLECTOR_REQUIREMENTS
			// on MyAddress, and we don't want them to have to worry
			// about filtering on the older cruftier <SUBSYS>IpAddr.

		if( clientAd->LookupString( ipattr, subsys_ipaddr ) ) {
			clientAd->LookupString( ATTR_MY_ADDRESS, my_address );
			if( my_address != subsys_ipaddr ) {
				dprintf(D_ALWAYS,
				        "%s VIOLATION: ClassAd from %s advertises inconsistent"
				        " IP addresses: %s=%s, %s=%s\n",
				        COLLECTOR_REQUIREMENTS,
				        (sock ? sock->get_sinful_peer() : "(NULL)"),
				        ipattr, subsys_ipaddr.Value(),
				        ATTR_MY_ADDRESS, my_address.Value());
				return false;
			}
		}
	}


		// Now verify COLLECTOR_REQUIREMENTS
	bool collector_req_result = false;
	if( !EvalBool(COLLECTOR_REQUIREMENTS,m_collector_requirements,clientAd,collector_req_result) ) {
		dprintf(D_ALWAYS,"WARNING: %s did not evaluate to a boolean result.\n",COLLECTOR_REQUIREMENTS);
		collector_req_result = false;
	}
	if( !collector_req_result ) {
		static int details_shown=0;
		bool show_details = (details_shown<10) || IsFulldebug(D_FULLDEBUG);
		dprintf(D_ALWAYS,"%s VIOLATION: requirements do not match ad from %s.%s\n",
				COLLECTOR_REQUIREMENTS,
				sock ? sock->get_sinful_peer() : "(null)",
				show_details ? " Contents of the ClassAd:" : " (turn on D_FULLDEBUG to see details)");
		if( show_details ) {
			details_shown += 1;
			dPrintAd(D_ALWAYS, *clientAd);
		}

		return false;
	}

	return true;
}
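
The check above is only active when the administrator sets COLLECTOR_REQUIREMENTS; an illustrative (hypothetical) configuration entry that this code would enforce, shown as a comment:

// Hypothetical condor_config entry enforced by the check above:
//
//   COLLECTOR_REQUIREMENTS = (MyAddress isnt undefined)
//
// Ads for which the expression does not evaluate to true are rejected
// with the "requirements do not match" message logged above.
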
Example no. 13
0
ClassAd *
process_request(const ClassAd *inputAd)
{
	static unsigned int req_number = 0;

		// Number each new request.
	ClassAd *resultAd = new ClassAd();
	ASSERT(resultAd);
	resultAd->Assign("REQUEST_NUMBER",++req_number);

	dprintf(D_ALWAYS,"----------------------------------------\nProcessing request %d\n",req_number);

	dprintf(D_FULLDEBUG,"Contents of request classad:\n");
	if ( inputAd ) {
		dPrintAd(D_FULLDEBUG, *(ClassAd*)inputAd);
	}

		// Create two temp dirs, one to serve as the iwd for the command, another
		// to hold the stdout/err.
	char *iwd = create_temp_file(true);
	char *stdio_iwd = create_temp_file(true);
	if (!iwd || !stdio_iwd) {
		handle_process_request_error("failed to create temp dirs",req_number,resultAd);
		return resultAd;
	}

		// Do the work.
	do_process_request(inputAd, resultAd, req_number, iwd, stdio_iwd);

		// Blow away our temp dirs unless we are in debug mode
	bool debug_mode = param_boolean("SOAPSHELL_DEBUG_MODE",false);
	if ( !debug_mode ) {
		if ( iwd ) {
			Directory dir(iwd);
			dir.Remove_Full_Path(iwd);
			free(iwd);
		}
		if ( stdio_iwd ) {
			Directory dir(stdio_iwd);
			dir.Remove_Full_Path(stdio_iwd);
			free(stdio_iwd);
		}
	} else {
		if ( iwd ) {
			dprintf(D_ALWAYS,"SOAPSHELL_DEBUG_MODE=True so not removing iwd %s\n",
					iwd);
			free(iwd);
		}
		if ( stdio_iwd ) {
			dprintf(D_ALWAYS,"SOAPSHELL_DEBUG_MODE=True so not removing stdio_iwd %s\n",
					stdio_iwd);
			free(stdio_iwd);
		}
	}

	dprintf(D_FULLDEBUG,"Contents of result classad:\n");
	dPrintAd(D_FULLDEBUG, *resultAd);

	dprintf(D_ALWAYS,"Finished processing request %d\n",req_number);
	return resultAd;
}
Example no. 14
0
// fetch all ads from the collector that satisfy the constraints
QueryResult CondorQuery::
fetchAds (ClassAdList &adList, const char *poolName, CondorError* errstack)
{
	Sock        *sock;
	int         more;
	QueryResult result;
	ClassAd     queryAd(extraAttrs), *ad;

	if ( !poolName ) {
		return Q_NO_COLLECTOR_HOST;
	}

	// contact collector
	Daemon my_collector( DT_COLLECTOR, poolName, NULL );
	if( !my_collector.locate() ) {
			// We were passed a bogus poolName, abort gracefully
		return Q_NO_COLLECTOR_HOST;
	}


	// make the query ad
	result = getQueryAd (queryAd);
	if (result != Q_OK) return result;

	if( IsDebugLevel( D_HOSTNAME ) ) {
		dprintf( D_HOSTNAME, "Querying collector %s (%s) with classad:\n", 
				 my_collector.addr(), my_collector.fullHostname() );
		dPrintAd( D_HOSTNAME, queryAd );
		dprintf( D_HOSTNAME, " --- End of Query ClassAd ---\n" );
	}


	int mytimeout = param_integer ("QUERY_TIMEOUT",60); 
	if (!(sock = my_collector.startCommand(command, Stream::reli_sock, mytimeout, errstack)) ||
	    !putClassAd (sock, queryAd) || !sock->end_of_message()) {

		if (sock) {
			delete sock;
		}
		return Q_COMMUNICATION_ERROR;
	}
	
	// get result
	sock->decode ();
	more = 1;
	while (more)
	{
		if (!sock->code (more)) {
			sock->end_of_message();
			delete sock;
			return Q_COMMUNICATION_ERROR;
		}
		if (more) {
			ad = new ClassAd;
			if( !getClassAd(sock, *ad) ) {
				sock->end_of_message();
				delete ad;
				delete sock;
				return Q_COMMUNICATION_ERROR;
			}
			adList.Insert (ad);
		}
	}
	sock->end_of_message();

	// finalize
	sock->close();
	delete sock;
	
	return (Q_OK);
}
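
An illustrative caller of this method (a sketch; the pool address is hypothetical):

// Query all startd ads from a given collector (illustrative only).
CondorQuery query( STARTD_AD );
ClassAdList ads;
CondorError errstack;
QueryResult rc = query.fetchAds( ads, "cm.example.edu:9618", &errstack );
if ( rc != Q_OK ) {
	dprintf( D_ALWAYS, "fetchAds failed (%d): %s\n",
	         rc, errstack.getFullText().c_str() );
}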