Ejemplo n.º 1
0
void CondorResource::DoScheddPoll()
{
	int rc;
	ScheddPollInfo *poll_info = NULL;

	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 scheddStatusActive == false ) {
			// No jobs or we can't talk to the schedd, so no point
			// in polling
		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
		return;
	}

	if ( gahp->isStarted() == false ) {
		// The gahp isn't started yet. Wait a few seconds for a CondorJob
		// object to start it (and possibly initialize x509 credentials).
		daemonCore->Reset_Timer( scheddPollTid, 5 );
		return;
	}

	PollInfoByName.lookup( HashKey( HashName( scheddName, poolName, NULL ) ),
						   poll_info );

	daemonCore->Reset_Timer( scheddPollTid, TIMER_NEVER );

	if ( scheddStatusActive == false ) {

			// We share polls across all CondorResource objects going to
			// the same schedd. If another object has done a poll
			// recently, then don't bother doing one ourselves.
		if ( poll_info  == NULL ) {
			poll_info = new ScheddPollInfo;
			poll_info->m_lastPoll = 0;
			poll_info->m_pollActive = false;
			PollInfoByName.insert( HashKey( HashName( scheddName, poolName,
													  NULL ) ),
								   poll_info );
		}

		if ( poll_info->m_pollActive == true ||
			 poll_info->m_lastPoll + BatchStatusInterval() > time(NULL) ) {
			daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
			return;
		}

			// start schedd status command
		dprintf( D_FULLDEBUG, "Starting collective poll: %s\n",
				 scheddName );
		std::string constraint;

			// create a list of jobs we expect to hear about in our
			// status command
			// Since we're sharing the results of this status command with
			// all CondorResource objects going to the same schedd, look
			// for their jobs as well.
		poll_info->m_submittedJobs.Rewind();
		while ( poll_info->m_submittedJobs.Next() ) {
			poll_info->m_submittedJobs.DeleteCurrent();
		}
		CondorResource *next_resource;
		BaseJob *job;
		std::string job_id;
		ResourcesByName.startIterations();
		while ( ResourcesByName.iterate( next_resource ) != 0 ) {
			if ( strcmp( scheddName, next_resource->scheddName ) ||
				 strcmp( poolName ? poolName : "",
						 next_resource->poolName ? next_resource->poolName : "" ) ) {
				continue;
			}

			next_resource->registeredJobs.Rewind();
			while ( ( job = next_resource->registeredJobs.Next() ) ) {
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
					poll_info->m_submittedJobs.Append( (CondorJob *)job );
				}
			}
		}

		formatstr( constraint, "(%s)", submitter_constraint.c_str() );

		rc = gahp->condor_job_status_constrained( scheddName,
												  constraint.c_str(),
												  NULL, NULL );

		if ( rc != GAHPCLIENT_COMMAND_PENDING ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd: %s\n",
					 rc, scheddName );
			EXCEPT( "condor_job_status_constrained failed!" );
		}
		scheddStatusActive = true;
		poll_info->m_pollActive = true;

	} else {

			// finish schedd status command
		int num_status_ads;
		ClassAd **status_ads = NULL;

		ASSERT( poll_info );

		rc = gahp->condor_job_status_constrained( NULL, NULL,
												  &num_status_ads,
												  &status_ads );

		if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
			return;
		} else if ( rc != 0 ) {
			dprintf( D_ALWAYS,
					 "gahp->condor_job_status_constrained returned %d for remote schedd %s\n",
					 rc, scheddName );
			dprintf( D_ALWAYS, "Requesting ping of resource\n" );
			RequestPing( NULL );
		}

		if ( rc == 0 ) {
			for ( int i = 0; i < num_status_ads; i++ ) {
				int cluster, proc;
				int rc2;
				std::string job_id_string;
				BaseJob *base_job = NULL;
				CondorJob *job;

				if( status_ads[i] == NULL ) {
					dprintf(D_ALWAYS, "DoScheddPoll was given null pointer for classad #%d\n", i);
					continue;
				}

				status_ads[i]->LookupInteger( ATTR_CLUSTER_ID, cluster );
				status_ads[i]->LookupInteger( ATTR_PROC_ID, proc );

				formatstr( job_id_string, "condor %s %s %d.%d", scheddName,
									   poolName, cluster, proc );

				rc2 = BaseJob::JobsByRemoteId.lookup( HashKey( job_id_string.c_str() ),
													  base_job );
				job = dynamic_cast<CondorJob*>( base_job );
				if ( rc2 == 0 ) {
					job->NotifyNewRemoteStatus( status_ads[i] );
					poll_info->m_submittedJobs.Delete( job );
				} else {
					delete status_ads[i];
				}
			}

			poll_info->m_lastPoll = time(NULL);
		}
		poll_info->m_pollActive = false;

		if ( status_ads != NULL ) {
			free( status_ads );
		}

			// Check if any jobs were missing from the status result
		if ( rc == 0 ) {
			CondorJob *job;
			std::string job_id;
			poll_info->m_submittedJobs.Rewind();
			while ( ( job = poll_info->m_submittedJobs.Next() ) ) {
				if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) {
						// We should have gotten a status ad for this job,
						// but didn't. Tell the job that there may be
						// something wrong by giving it a NULL status ad.
					job->NotifyNewRemoteStatus( NULL );
				}
				poll_info->m_submittedJobs.DeleteCurrent();
			}
		}

		scheddStatusActive = false;

		dprintf( D_FULLDEBUG, "Collective poll complete: %s\n", scheddName );

		daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() );
	}
}
Ejemplo n.º 2
0
void NordugridResource::DoJobStatus()
{
	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 m_jobStatusActive == false ) {
			// No jobs or we can't talk to the resource, so no point
			// in polling
		daemonCore->Reset_Timer( m_jobStatusTid, NordugridJob::probeInterval );
		return;
	}

	if ( m_statusGahp->isStarted() == false ) {
		// The gahp isn't started yet. Wait a few seconds for a Job
		// object to start it and initialize x509 credentials.
		daemonCore->Reset_Timer( m_jobStatusTid, 5 );
		return;
	}

	daemonCore->Reset_Timer( m_jobStatusTid, TIMER_NEVER );

	StringList results;

	if ( m_jobStatusActive == false ) {

			// start ldap status command
		dprintf( D_FULLDEBUG, "Starting ldap poll: %s\n", resourceName );

		std::string ldap_server = resourceName;
		size_t pos = ldap_server.find_first_of( ':' );
		if ( pos != std::string::npos ) {
			ldap_server.erase( pos );
		}

		std::string filter;
		sprintf( filter, "(&(objectclass=nordugrid-job)(nordugrid-job-globalowner=%s))", proxySubject );
		int rc = m_statusGahp->nordugrid_ldap_query( ldap_server.c_str(), "mds-vo-name=local,o=grid", filter.c_str(), "nordugrid-job-globalid,nordugrid-job-status",
													 results );
		if ( rc != GAHPCLIENT_COMMAND_PENDING ) {
			dprintf( D_ALWAYS,
					 "gahp->nordugrid_ldap_query returned %d for resource %s\n",
					 rc, resourceName );
			EXCEPT( "nordugrid_ldap_query failed!" );
		}
		m_jobStatusActive = true;

	} else {

			// finish ldap status command
		int rc = m_statusGahp->nordugrid_ldap_query( NULL, NULL, NULL, NULL, results );

		if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
			return;
		} else if ( rc != 0 ) {
			dprintf( D_ALWAYS,
					 "gahp->nordugrid_ldap_query returned %d for resource %s: %s\n",
					 rc, resourceName, m_statusGahp->getErrorString() );
			dprintf( D_ALWAYS, "Requesting ping of resource\n" );
			RequestPing( NULL );
		}

		if ( rc == 0 ) {
			const char *next_job_id = NULL;
			const char *next_status = NULL;
			const char *next_attr;
			std::string key;

			results.rewind();
			do {
				next_attr = results.next();

				if ( next_attr != NULL && *next_attr != '\0' ) {
						// Save the attributes we're interested in
					if ( !strncmp( next_attr, "nordugrid-job-globalid: ", 24 ) ) {
						next_job_id = next_attr;
					} else if ( !strncmp( next_attr, "nordugrid-job-status: ", 22 ) ) {
						next_status = next_attr;
					}
					continue;
				}
					// We just reached the end of a record. Process it.
					// If we don't have the attributes we expect, skip it.
				if ( next_job_id && next_status ) {
					int rc2;
					NordugridJob *job;
					sprintf( key, "nordugrid %s %s", resourceName,
							 strrchr( next_job_id, '/' ) + 1 );
					rc2 = BaseJob::JobsByRemoteId.lookup( HashKey( key.c_str() ),
														  (BaseJob*&)job );
					if ( rc2 == 0 ) {
						job->NotifyNewRemoteStatus( strchr( next_status, ' ' ) + 1 );
					}
				}
				next_job_id = NULL;
				next_status = NULL;
			} while ( next_attr );

		}

		m_jobStatusActive = false;

		dprintf( D_FULLDEBUG, "ldap poll complete: %s\n", resourceName );

		daemonCore->Reset_Timer( m_jobStatusTid, NordugridJob::probeInterval );
	}
}