void CondorResource::DoScheddPoll() { int rc; ScheddPollInfo *poll_info = NULL; if ( ( registeredJobs.IsEmpty() || resourceDown ) && scheddStatusActive == false ) { // No jobs or we can't talk to the schedd, so no point // in polling daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() ); return; } if ( gahp->isStarted() == false ) { // The gahp isn't started yet. Wait a few seconds for a CondorJob // object to start it (and possibly initialize x509 credentials). daemonCore->Reset_Timer( scheddPollTid, 5 ); return; } PollInfoByName.lookup( HashKey( HashName( scheddName, poolName, NULL ) ), poll_info ); daemonCore->Reset_Timer( scheddPollTid, TIMER_NEVER ); if ( scheddStatusActive == false ) { // We share polls across all CondorResource objects going to // the same schedd. If another object has done a poll // recently, then don't bother doing one ourselves. if ( poll_info == NULL ) { poll_info = new ScheddPollInfo; poll_info->m_lastPoll = 0; poll_info->m_pollActive = false; PollInfoByName.insert( HashKey( HashName( scheddName, poolName, NULL ) ), poll_info ); } if ( poll_info->m_pollActive == true || poll_info->m_lastPoll + BatchStatusInterval() > time(NULL) ) { daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() ); return; } // start schedd status command dprintf( D_FULLDEBUG, "Starting collective poll: %s\n", scheddName ); std::string constraint; // create a list of jobs we expect to hear about in our // status command // Since we're sharing the results of this status command with // all CondorResource objects going to the same schedd, look // for their jobs as well. poll_info->m_submittedJobs.Rewind(); while ( poll_info->m_submittedJobs.Next() ) { poll_info->m_submittedJobs.DeleteCurrent(); } CondorResource *next_resource; BaseJob *job; std::string job_id; ResourcesByName.startIterations(); while ( ResourcesByName.iterate( next_resource ) != 0 ) { if ( strcmp( scheddName, next_resource->scheddName ) || strcmp( poolName ? poolName : "", next_resource->poolName ? next_resource->poolName : "" ) ) { continue; } next_resource->registeredJobs.Rewind(); while ( ( job = next_resource->registeredJobs.Next() ) ) { if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) { poll_info->m_submittedJobs.Append( (CondorJob *)job ); } } } formatstr( constraint, "(%s)", submitter_constraint.c_str() ); rc = gahp->condor_job_status_constrained( scheddName, constraint.c_str(), NULL, NULL ); if ( rc != GAHPCLIENT_COMMAND_PENDING ) { dprintf( D_ALWAYS, "gahp->condor_job_status_constrained returned %d for remote schedd: %s\n", rc, scheddName ); EXCEPT( "condor_job_status_constrained failed!" ); } scheddStatusActive = true; poll_info->m_pollActive = true; } else { // finish schedd status command int num_status_ads; ClassAd **status_ads = NULL; ASSERT( poll_info ); rc = gahp->condor_job_status_constrained( NULL, NULL, &num_status_ads, &status_ads ); if ( rc == GAHPCLIENT_COMMAND_PENDING ) { return; } else if ( rc != 0 ) { dprintf( D_ALWAYS, "gahp->condor_job_status_constrained returned %d for remote schedd %s\n", rc, scheddName ); dprintf( D_ALWAYS, "Requesting ping of resource\n" ); RequestPing( NULL ); } if ( rc == 0 ) { for ( int i = 0; i < num_status_ads; i++ ) { int cluster, proc; int rc2; std::string job_id_string; BaseJob *base_job = NULL; CondorJob *job; if( status_ads[i] == NULL ) { dprintf(D_ALWAYS, "DoScheddPoll was given null pointer for classad #%d\n", i); continue; } status_ads[i]->LookupInteger( ATTR_CLUSTER_ID, cluster ); status_ads[i]->LookupInteger( ATTR_PROC_ID, proc ); formatstr( job_id_string, "condor %s %s %d.%d", scheddName, poolName, cluster, proc ); rc2 = BaseJob::JobsByRemoteId.lookup( HashKey( job_id_string.c_str() ), base_job ); job = dynamic_cast<CondorJob*>( base_job ); if ( rc2 == 0 ) { job->NotifyNewRemoteStatus( status_ads[i] ); poll_info->m_submittedJobs.Delete( job ); } else { delete status_ads[i]; } } poll_info->m_lastPoll = time(NULL); } poll_info->m_pollActive = false; if ( status_ads != NULL ) { free( status_ads ); } // Check if any jobs were missing from the status result if ( rc == 0 ) { CondorJob *job; std::string job_id; poll_info->m_submittedJobs.Rewind(); while ( ( job = poll_info->m_submittedJobs.Next() ) ) { if ( job->jobAd->LookupString( ATTR_GRID_JOB_ID, job_id ) ) { // We should have gotten a status ad for this job, // but didn't. Tell the job that there may be // something wrong by giving it a NULL status ad. job->NotifyNewRemoteStatus( NULL ); } poll_info->m_submittedJobs.DeleteCurrent(); } } scheddStatusActive = false; dprintf( D_FULLDEBUG, "Collective poll complete: %s\n", scheddName ); daemonCore->Reset_Timer( scheddPollTid, BatchStatusInterval() ); } }
int BaseResource::DoBatchStatus() { dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s.\n", ResourceName()); if ( ( registeredJobs.IsEmpty() || resourceDown ) && m_batchStatusActive == false ) { // No jobs or we can't talk to the schedd, so no point // in polling daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() ); dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s skipped for %d seconds because %s.\n", ResourceName(), BatchStatusInterval(), resourceDown ? "the resource is down":"there are no jobs registered"); return 0; } GahpClient * gahp = BatchGahp(); if ( gahp && gahp->isStarted() == false ) { int GAHP_INIT_DELAY = 5; dprintf( D_ALWAYS,"BaseResource::DoBatchStatus: gahp server not up yet, delaying %d seconds\n", GAHP_INIT_DELAY ); daemonCore->Reset_Timer( m_batchPollTid, GAHP_INIT_DELAY ); return 0; } daemonCore->Reset_Timer( m_batchPollTid, TIMER_NEVER ); if(m_batchStatusActive == false) { dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Starting bulk job poll of %s\n", ResourceName()); BatchStatusResult bsr = StartBatchStatus(); switch(bsr) { case BSR_DONE: dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName()); daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() ); return 0; case BSR_ERROR: dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to start a bulk poll of %s\n", ResourceName()); daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() ); return 0; case BSR_PENDING: m_batchStatusActive = true; return 0; default: EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d", (int)bsr); } } else { BatchStatusResult bsr = FinishBatchStatus(); switch(bsr) { case BSR_DONE: dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName()); m_batchStatusActive = false; daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() ); return 0; case BSR_ERROR: dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to finish a bulk poll of %s\n", ResourceName()); m_batchStatusActive = false; daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() ); return 0; case BSR_PENDING: return 0; default: EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d", (int)bsr); } } return 0; }