// Timer handler that performs a collective status poll of a remote schedd.
// The poll is shared across all CondorResource objects that target the same
// schedd/pool: one object issues the constrained status query and every
// object's registered jobs are matched against the returned ads.
// The function runs as a two-phase state machine driven by
// scheddStatusActive: first call starts the GAHP command, later calls
// collect its result.
void CondorResource::DoScheddPoll()
{
	int rc;
	ScheddPollInfo *poll_info = NULL;

	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 scheddStatusActive == false ) {
			// No jobs or we can't talk to the schedd, so no point
			// in polling
		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
		return;
	}

	if ( gahp->isStarted() == false ) {
		// The gahp isn't started yet. Wait a few seconds for a CondorJob
		// object to start it (and possibly initialize x509 credentials).
		daemonCore->Reset_Timer( scheddPollTid, 5 );
		return;
	}

		// Look up the shared poll state for this schedd/pool pair.
		// poll_info stays NULL if no object has polled this schedd yet.
	PollInfoByName.lookup( HashKey( HashName( scheddName, poolName, NULL ) ),
						   poll_info );

		// Disable the timer; it is re-armed on every exit path that
		// should lead to another poll.
	daemonCore->Reset_Timer( scheddPollTid, TIMER_NEVER );

	if ( scheddStatusActive == false ) {

			// We share polls across all CondorResource objects going to
			// the same schedd. If another object has done a poll
			// recently, then don't bother doing one ourselves.
		if ( poll_info  == NULL ) {
			poll_info = new ScheddPollInfo;
			poll_info->m_lastPoll = 0;
			poll_info->m_pollActive = false;
			PollInfoByName.insert( HashKey( HashName( scheddName, poolName,
													  NULL ) ),
								   poll_info );
		}

		if ( poll_info->m_pollActive == true ||
			 poll_info->m_lastPoll + BatchStatusInterval() > time(NULL) ) {
			daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
			return;
		}

			// start schedd status command
		dprintf( D_FULLDEBUG, "Starting collective poll: %s\n",
				 scheddName );
		std::string constraint;

			// create a list of jobs we expect to hear about in our
			// status command
			// Since we're sharing the results of this status command with
			// all CondorResource objects going to the same schedd, look
			// for their jobs as well.
		poll_info->m_submittedJobs.Rewind();
		while ( poll_info->m_submittedJobs.Next() ) {
			poll_info->m_submittedJobs.DeleteCurrent();
		}
		CondorResource *next_resource;
		BaseJob *job;
		std::string job_id;
		ResourcesByName.startIterations();
		while ( ResourcesByName.iterate( next_resource ) != 0 ) {
				// Skip resources that point at a different schedd or pool
				// (a NULL poolName is treated the same as an empty one).
			if ( strcmp( scheddName, next_resource->scheddName ) ||
				 strcmp( poolName ? poolName : "",
						 next_resource->poolName ? next_resource->poolName : "" ) ) {
				continue;
			}

				// Only jobs that already have a remote id are expected
				// to show up in the status results.
			next_resource->registeredJobs.Rewind();
			while ( ( job = next_resource->registeredJobs.Next() ) ) {
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
					poll_info->m_submittedJobs.Append( (CondorJob *)job );
				}
			}
		}

		formatstr( constraint, "(%s)", submitter_constraint.c_str() );

		rc = gahp->condor_job_status_constrained( scheddName,
												  constraint.c_str(),
												  NULL, NULL );

		if ( rc != GAHPCLIENT_COMMAND_PENDING ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd: %s\n",
					 rc, scheddName );
			EXCEPT( "condor_job_status_constrained failed!" );
		}
		scheddStatusActive = true;
		poll_info->m_pollActive = true;

	} else {

			// finish schedd status command
		int num_status_ads;
		ClassAd **status_ads = NULL;

		ASSERT( poll_info );

		rc = gahp->condor_job_status_constrained( NULL, NULL,
												  &num_status_ads,
												  &status_ads );

		if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
				// Result not ready yet; GAHP will re-trigger us.
			return;
		} else if ( rc != 0 ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd %s\n",
					 rc, scheddName );
			dprintf( D_ALWAYS, "Requesting ping of resource\n" );
			RequestPing( NULL );
		}

		if ( rc == 0 ) {
			for ( int i = 0; i < num_status_ads; i++ ) {
					// Initialize so a missing attribute can't leave us
					// with uninitialized garbage in the lookup key below.
				int cluster = -1;
				int proc = -1;
				int rc2;
				std::string job_id_string;
				BaseJob *base_job = NULL;
				CondorJob *job;

				if( status_ads[i] == NULL ) {
					dprintf(D_ALWAYS, "DoScheddPoll was given null pointer for classad #%d\n", i);
					continue;
				}

				status_ads[i]->LookupInteger( ATTR_CLUSTER_ID, cluster );
				status_ads[i]->LookupInteger( ATTR_PROC_ID, proc );

					// NOTE(review): poolName is passed unguarded to %s
					// here; presumably it is non-NULL whenever remote job
					// ids were recorded this way — confirm against the
					// code that sets ATTR_GRID_JOB_ID.
				formatstr( job_id_string, "condor %s %s %d.%d", scheddName,
									   poolName, cluster, proc );

				rc2 = BaseJob::JobsByRemoteId.lookup( HashKey( job_id_string.c_str() ),
													  base_job );
				job = dynamic_cast<CondorJob*>( base_job );
					// Guard against a successful lookup whose entry is
					// not actually a CondorJob (dynamic_cast -> NULL);
					// dereferencing it would crash. Unmatched ads are
					// deleted to avoid leaking them.
				if ( rc2 == 0 && job != NULL ) {
					job->NotifyNewRemoteStatus( status_ads[i] );
					poll_info->m_submittedJobs.Delete( job );
				} else {
					delete status_ads[i];
				}
			}

			poll_info->m_lastPoll = time(NULL);
		}
		poll_info->m_pollActive = false;

		if ( status_ads != NULL ) {
				// The array itself is malloc'd by the GAHP client layer;
				// the ClassAds inside were consumed or deleted above.
			free( status_ads );
		}

			// Check if any jobs were missing from the status result
		if ( rc == 0 ) {
			CondorJob *job;
			std::string job_id;
			poll_info->m_submittedJobs.Rewind();
			while ( ( job = poll_info->m_submittedJobs.Next() ) ) {
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
						// We should have gotten a status ad for this job,
						// but didn't. Tell the job that there may be
						// something wrong by giving it a NULL status ad.
					job->NotifyNewRemoteStatus( NULL );
				}
				poll_info->m_submittedJobs.DeleteCurrent();
			}
		}

		scheddStatusActive = false;

		dprintf( D_FULLDEBUG, "Collective poll complete: %s\n", scheddName );

		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
	}
}
// ---------------------------------------------------------------------------
int BaseResource::DoBatchStatus()
{
	dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s.\n", ResourceName());

	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 m_batchStatusActive == false ) {
			// No jobs or we can't talk to the schedd, so no point
			// in polling
		daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
		dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s skipped for %d seconds because %s.\n", ResourceName(), BatchStatusInterval(), resourceDown ? "the resource is down":"there are no jobs registered");
		return 0;
	}

	GahpClient * gahp = BatchGahp();
	if ( gahp && gahp->isStarted() == false ) {
		int GAHP_INIT_DELAY = 5;
		dprintf( D_ALWAYS,"BaseResource::DoBatchStatus: gahp server not up yet, delaying %d seconds\n", GAHP_INIT_DELAY );
		daemonCore->Reset_Timer( m_batchPollTid, GAHP_INIT_DELAY );
		return 0;
	}

	daemonCore->Reset_Timer( m_batchPollTid, TIMER_NEVER );

	if(m_batchStatusActive == false) {
		dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Starting bulk job poll of %s\n", ResourceName());
		BatchStatusResult bsr = StartBatchStatus();
		switch(bsr) {
			case BSR_DONE:
				dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName());
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_ERROR:
				dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to start a bulk poll of %s\n", ResourceName());
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_PENDING:
				m_batchStatusActive = true;
				return 0;

			default:
				EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d", (int)bsr);
		}

	} else {
		BatchStatusResult bsr = FinishBatchStatus();
		switch(bsr) {
			case BSR_DONE:
				dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName());
				m_batchStatusActive = false;
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_ERROR:
				dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to finish a bulk poll of %s\n", ResourceName());
				m_batchStatusActive = false;
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_PENDING:
				return 0;

			default:
				EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d", (int)bsr);
		}
	}
	return 0;
}