Example #1
int BaseResource::DoBatchStatus()
{
	dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s.\n", ResourceName());

	if ( ( registeredJobs.IsEmpty() || resourceDown ) &&
		 m_batchStatusActive == false ) {
			// No jobs or the resource is down, so no point
			// in polling
		daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
		dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus for %s skipped for %d seconds because %s.\n", ResourceName(), BatchStatusInterval(), resourceDown ? "the resource is down":"there are no jobs registered");
		return 0;
	}

	GahpClient * gahp = BatchGahp();
	if ( gahp && gahp->isStarted() == false ) {
		int GAHP_INIT_DELAY = 5;
		dprintf( D_ALWAYS,"BaseResource::DoBatchStatus: gahp server not up yet, delaying %d seconds\n", GAHP_INIT_DELAY );
		daemonCore->Reset_Timer( m_batchPollTid, GAHP_INIT_DELAY );
		return 0;
	}

	daemonCore->Reset_Timer( m_batchPollTid, TIMER_NEVER );

	if(m_batchStatusActive == false) {
		dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Starting bulk job poll of %s\n", ResourceName());
		BatchStatusResult bsr = StartBatchStatus();
		switch(bsr) {
			case BSR_DONE:
				dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName());
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_ERROR:
				dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to start a bulk poll of %s\n", ResourceName());
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_PENDING:
				m_batchStatusActive = true;
				return 0;

			default:
				EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d\n", (int)bsr);
		}

	} else {
		BatchStatusResult bsr = FinishBatchStatus();
		switch(bsr) {
			case BSR_DONE:
				dprintf(D_FULLDEBUG, "BaseResource::DoBatchStatus: Finished bulk job poll of %s\n", ResourceName());
				m_batchStatusActive = false;
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_ERROR:
				dprintf(D_ALWAYS, "BaseResource::DoBatchStatus: An error occurred trying to finish a bulk poll of %s\n", ResourceName());
				m_batchStatusActive = false;
				daemonCore->Reset_Timer( m_batchPollTid, BatchStatusInterval() );
				return 0;

			case BSR_PENDING:
				return 0;

			default:
				EXCEPT("BaseResource::DoBatchStatus: Unknown BatchStatusResult %d\n", (int)bsr);
		}
	}
	return 0;
}
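The two-phase poll above relies on subclasses implementing StartBatchStatus() and FinishBatchStatus() around a non-blocking GAHP query: start the query, report BSR_PENDING while it is outstanding, then map the outcome to BSR_DONE or BSR_ERROR. A minimal sketch of that contract, assuming a hypothetical subclass and GAHP call (ExampleResource, m_statusGahp, and example_status_all are illustrative names, not real HTCondor APIs):

BaseResource::BatchStatusResult ExampleResource::StartBatchStatus()
{
	// Issue, or re-poll, the asynchronous status query.
	int rc = m_statusGahp->example_status_all( m_results );  // hypothetical call
	if ( rc == GAHPCLIENT_COMMAND_PENDING ) {
		return BSR_PENDING;  // DoBatchStatus() will call FinishBatchStatus() later
	}
	return ( rc == 0 ) ? BSR_DONE : BSR_ERROR;
}

BaseResource::BatchStatusResult ExampleResource::FinishBatchStatus()
{
	// GAHP clients cache the pending command, so re-issuing the same call
	// completes it; delegating back to StartBatchStatus() is a common
	// pattern (the EC2 resource in example #8 handles the pending case
	// inside StartBatchStatus() for exactly this reason).
	return StartBatchStatus();
}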
Example #2
void
StatusClient::hookExited(int exit_status)
{
	std::string key = m_routed_job->dest_key;
	if (false == JobRouterHookMgr::removeKnownHook(key.c_str(), HOOK_UPDATE_JOB_INFO))
	{
		dprintf(D_ALWAYS|D_FAILURE, "StatusClient::hookExited (%s): "
			"Failed to remove hook info for job key %s.\n",
			m_routed_job->JobDesc().c_str(), key.c_str());
		EXCEPT("StatusClient::hookExited: Received exit notification "
			"for job with key %s, which isn't a key for a job "
			"known to have a status hook running.", key.c_str());
		return;
	}

	HookClient::hookExited(exit_status);

	if (m_std_err.Length())
	{
		dprintf(D_ALWAYS, "StatusClient::hookExited (%s): Warning, "
				"hook %s (pid %d) printed to stderr: %s\n",
				m_routed_job->JobDesc().c_str(), m_hook_path,
				(int)m_pid, m_std_err.Value());
	}
	if (m_std_out.Length() && 0 == WEXITSTATUS(exit_status))
	{
		ClassAd job_ad;
		const char* hook_line = NULL;
		const char* attrs_to_delete[] = {
			ATTR_MY_TYPE,
			ATTR_TARGET_TYPE,
			NULL };

		m_std_out.Tokenize();
		while ((hook_line = m_std_out.GetNextToken("\n", true)))
		{
			if (!job_ad.Insert(hook_line))
			{
				dprintf(D_ALWAYS, "StatusClient::hookExited (%s): "
						"Failed to insert \"%s\" into "
						"ClassAd, ignoring invalid "
						"hook output.  Job status NOT "
						"updated.\n", m_routed_job->JobDesc().c_str(), hook_line);
				job_router->GracefullyRemoveJob(m_routed_job);
				return;
			}
		}
		// Delete attributes that may have been returned by the hook
		// but should not be included in the update
		for (int index = 0; attrs_to_delete[index] != NULL; ++index)
		{
			job_ad.Delete(attrs_to_delete[index]);
		}
		job_router->UpdateRoutedJobStatus(m_routed_job, job_ad);
	}
	else
	{
		if (0 == WEXITSTATUS(exit_status))
		{
			dprintf(D_FULLDEBUG, "StatusClient::hookExited (%s): "
					"Hook %s (pid %d) returned no data.\n",
					m_routed_job->JobDesc().c_str(),
					m_hook_path, (int)m_pid);
			job_router->FinishCheckSubmittedJobStatus(m_routed_job);
		}
		else
		{
			dprintf(D_FULLDEBUG, "StatusClient::hookExited (%s): "						"Hook %s (pid %d) exited with return "
					"status %d.  Ignoring output.\n", 
					m_routed_job->JobDesc().c_str(),
					m_hook_path, (int)m_pid,
					(int)WEXITSTATUS(exit_status));
		}
	}
}
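The insert loop above treats the hook's stdout as newline-delimited old-ClassAd assignments and discards the entire update on the first malformed line. A self-contained sketch of that parse discipline using only the standard library (parseHookOutput and the std::map stand-in are illustrative, not the real ClassAd API):

#include <map>
#include <sstream>
#include <string>

// Split hook stdout on newlines; treat each line as "Attr = Expr".
// Returns false on the first malformed line so the caller can reject
// the whole update, mirroring the behavior of hookExited() above.
static bool parseHookOutput(const std::string &stdout_text,
                            std::map<std::string, std::string> &ad)
{
	std::istringstream in(stdout_text);
	std::string line;
	while (std::getline(in, line)) {
		std::string::size_type eq = line.find('=');
		if (eq == std::string::npos) {
			return false;  // invalid hook output: caller ignores everything
		}
		std::string attr = line.substr(0, eq);
		std::string expr = line.substr(eq + 1);
		attr.erase(attr.find_last_not_of(" \t") + 1);   // trim trailing blanks
		expr.erase(0, expr.find_first_not_of(" \t"));   // trim leading blanks
		ad[attr] = expr;
	}
	return true;
}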
Example #3
int
pseudo_ulog( ClassAd *ad )
{
	ULogEvent *event = instantiateEvent(ad);
	int result = 0;
	char const *critical_error = NULL;
	MyString CriticalErrorBuf;
	bool event_already_logged = false;
	bool put_job_on_hold = false;
	char const *hold_reason = NULL;
	char *hold_reason_buf = NULL;
	int hold_reason_code = 0;
	int hold_reason_sub_code = 0;

	if(!event) {
		MyString add_str;
		sPrintAd(add_str, *ad);
		dprintf(
		  D_ALWAYS,
		  "invalid event ClassAd in pseudo_ulog: %s\n",
		  add_str.Value());
		return -1;
	}

	if(ad->LookupInteger(ATTR_HOLD_REASON_CODE,hold_reason_code)) {
		put_job_on_hold = true;
		ad->LookupInteger(ATTR_HOLD_REASON_SUBCODE,hold_reason_sub_code);
		ad->LookupString(ATTR_HOLD_REASON,&hold_reason_buf);
		if(hold_reason_buf) {
			hold_reason = hold_reason_buf;
		}
	}

	if( event->eventNumber == ULOG_REMOTE_ERROR ) {
		RemoteErrorEvent *err = (RemoteErrorEvent *)event;

		if(!err->getExecuteHost() || !*err->getExecuteHost()) {
			//Insert remote host information.
			char *execute_host = NULL;
			thisRemoteResource->getMachineName(execute_host);
			err->setExecuteHost(execute_host);
			delete[] execute_host;
		}

		if(err->isCriticalError()) {
			CriticalErrorBuf.formatstr(
			  "Error from %s: %s",
			  err->getExecuteHost(),
			  err->getErrorText());

			critical_error = CriticalErrorBuf.Value();
			if(!hold_reason) {
				hold_reason = critical_error;
			}

			//Temporary: the following causes critical remote errors
			//to be logged as ShadowExceptionEvents, rather than
			//RemoteErrorEvents.  The result is ugly, but guaranteed to
			//be compatible with other user-log reading tools.
			BaseShadow::log_except(critical_error);
			event_already_logged = true;
		}
	}

	if( !event_already_logged && !Shadow->uLog.writeEvent( event, ad ) ) {
		MyString add_str;
		sPrintAd(add_str, *ad);
		dprintf(
		  D_ALWAYS,
		  "unable to log event in pseudo_ulog: %s\n",
		  add_str.Value());
		result = -1;
	}

	if(put_job_on_hold) {
		hold_reason = critical_error;
		if(!hold_reason) {
			hold_reason = "Job put on hold by remote host.";
		}
		Shadow->holdJobAndExit(hold_reason,hold_reason_code,hold_reason_sub_code);
		//should never get here, because holdJobAndExit() exits.
	}

	if( critical_error ) {
		//Suppress ugly "Shadow exception!"
		Shadow->exception_already_logged = true;

		//lame: at the time of this writing, EXCEPT does not want const:
		EXCEPT("%s", critical_error);
	}

	delete event;
	return result;
}
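pseudo_ulog() keys its hold path off three attributes of the incoming event ad: a present ATTR_HOLD_REASON_CODE triggers the hold, with ATTR_HOLD_REASON_SUBCODE and ATTR_HOLD_REASON as optional refinements. A sketch of how a sender might mark an event as a hold request (the ATTR_* names are the ones used above; the call site and the code value 42 are purely illustrative):

ClassAd event_ad;
// Any HoldReasonCode at all flips put_job_on_hold in pseudo_ulog().
event_ad.Assign( ATTR_HOLD_REASON_CODE, 42 );   // illustrative code value
event_ad.Assign( ATTR_HOLD_REASON_SUBCODE, 0 );
event_ad.Assign( ATTR_HOLD_REASON, "Example hold reason from the remote side" );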
Example #4
ClassAd *CollectorEngine::
collect (int command,ClassAd *clientAd,const condor_sockaddr& from,int &insert,Sock *sock)
{
	ClassAd		*retVal;
	ClassAd		*pvtAd;
	int		insPvt;
	AdNameHashKey		hk;
	HashString	hashString;
	static int repeatStartdAds = -1;		// for debugging
	ClassAd		*clientAdToRepeat = NULL;

	if (repeatStartdAds == -1) {
		repeatStartdAds = param_integer("COLLECTOR_REPEAT_STARTD_ADS",0);
	}

	if( !ValidateClassAd(command,clientAd,sock) ) {
		return NULL;
	}

	// mux on command
	switch (command)
	{
	  case UPDATE_STARTD_AD:
	  case UPDATE_STARTD_AD_WITH_ACK:
#if !defined(WANT_OLD_CLASSADS)
		  clientAd->AddTargetRefs( TargetJobAttrs );
#endif
		if ( repeatStartdAds > 0 ) {
			clientAdToRepeat = new ClassAd(*clientAd);
		}
		if (!makeStartdAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (StartdAds, "StartdAd     ", "Start",
							  clientAd, hk, hashString, insert, from );

		// if we want to store private ads
		if (!sock)
		{
			dprintf (D_ALWAYS, "Want private ads, but no socket given!\n");
			break;
		}
		else
		{
			if (!(pvtAd = new ClassAd))
			{
				EXCEPT ("Memory error!");
			}
			if( !pvtAd->initFromStream(*sock) )
			{
				dprintf(D_FULLDEBUG,"\t(Could not get startd's private ad)\n");
				delete pvtAd;
				break;
			}

				// Fix up some stuff in the private ad that we depend on.
				// We started doing this in 7.2.0, so once we no longer
				// care about compatibility with stuff from before then,
				// the startd could stop bothering to send these attributes.

				// Queries of private ads depend on the following:
			pvtAd->SetMyTypeName( STARTD_ADTYPE );

				// Negotiator matches up private ad with public ad by
				// using the following.
			if( retVal ) {
				pvtAd->CopyAttribute( ATTR_MY_ADDRESS, retVal );
				pvtAd->CopyAttribute( ATTR_NAME, retVal );
			}


			// insert the private ad into its hashtable --- use the same
			// hash key as the public ad
			(void) updateClassAd (StartdPrivateAds, "StartdPvtAd  ",
								  "StartdPvt", pvtAd, hk, hashString, insPvt,
								  from );
		}

		// create fake duplicates of this ad, each with a different name, if
		// we are told to do so.  this feature exists for developer
		// scalability testing.
		if ( repeatStartdAds > 0 && clientAdToRepeat ) {
			ClassAd *fakeAd;
			int n;
			char newname[150],oldname[130];
			oldname[0] = '\0';
			clientAdToRepeat->LookupString("Name",oldname,sizeof(oldname));
			for (n=0;n<repeatStartdAds;n++) {
				fakeAd = new ClassAd(*clientAdToRepeat);
				snprintf(newname,sizeof(newname),
						 "Name=\"fake%d-%s\"",n,oldname);
				fakeAd->InsertOrUpdate(newname);
				makeStartdAdHashKey (hk, fakeAd);
				hashString.Build( hk );
				if (! updateClassAd (StartdAds, "StartdAd     ", "Start",
							  fakeAd, hk, hashString, insert, from ) )
				{
					// don't leak memory if there is some failure
					delete fakeAd;
				}
			}
			delete clientAdToRepeat;
			clientAdToRepeat = NULL;
		}
		break;

	  case MERGE_STARTD_AD:
#if !defined(WANT_OLD_CLASSADS)
		  clientAd->AddTargetRefs( TargetJobAttrs );
#endif
		if (!makeStartdAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=mergeClassAd (StartdAds, "StartdAd     ", "Start",
							  clientAd, hk, hashString, insert, from );
		break;

#ifdef HAVE_EXT_POSTGRESQL
	  case UPDATE_QUILL_AD:
		if (!makeQuillAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (QuillAds, "QuillAd     ", "Quill",
							  clientAd, hk, hashString, insert, from );
		break;
#endif /* HAVE_EXT_POSTGRESQL */

	  case UPDATE_SCHEDD_AD:
		if (!makeScheddAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (ScheddAds, "ScheddAd     ", "Schedd",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_SUBMITTOR_AD:
		// use the same hashkey function as a schedd ad
		if (!makeScheddAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		// CRUFT: Before 7.3.2, submitter ads had a MyType of
		//   "Scheduler". The only way to tell the difference
		//   was that submitter ads didn't have ATTR_NUM_USERS.
		//   Coerce MyType to "Submitter" for ads coming from
		//   these older schedds.
		//   Before 7.7.3, submitter ads for parallel universe
		//   jobs had a MyType of "Scheduler".
		clientAd->SetMyTypeName( SUBMITTER_ADTYPE );
		// since submittor ads always follow a schedd ad, and a master check is
		// performed for schedd ads, we don't need a master check in here
		hashString.Build( hk );
		retVal=updateClassAd (SubmittorAds, "SubmittorAd  ", "Submittor",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_LICENSE_AD:
		// use the same hashkey function as a schedd ad
		if (!makeLicenseAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		// since submittor ads always follow a schedd ad, and a master check is
		// performed for schedd ads, we don't need a master check in here
		hashString.Build( hk );
		retVal=updateClassAd (LicenseAds, "LicenseAd  ", "License",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_MASTER_AD:
		if (!makeMasterAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (MasterAds, "MasterAd     ", "Master",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_CKPT_SRVR_AD:
		if (!makeCkptSrvrAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (CkptServerAds, "CkptSrvrAd   ", "CkptSrvr",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_COLLECTOR_AD:
		if (!makeCollectorAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (CollectorAds, "CollectorAd  ", "Collector",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_STORAGE_AD:
		if (!makeStorageAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (StorageAds, "StorageAd  ", "Storage",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_NEGOTIATOR_AD:
		if (!makeNegotiatorAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
			// first, purge all the existing negotiator ads, since we
			// want to enforce that *ONLY* 1 negotiator is in the
			// collector any given time.
		purgeHashTable( NegotiatorAds );
		retVal=updateClassAd (NegotiatorAds, "NegotiatorAd  ", "Negotiator",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_HAD_AD:
		if (!makeHadAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (HadAds, "HadAd  ", "HAD",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_GRID_AD:
		if (!makeGridAdHashKey(hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (GridAds, "GridAd  ", "Grid",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_AD_GENERIC:
	  {
		  const char *type_str = clientAd->GetMyTypeName();
		  if (type_str == NULL) {
			  dprintf(D_ALWAYS, "collect: UPDATE_AD_GENERIC: ad has no type\n");
			  insert = -3;
			  retVal = 0;
			  break;
		  }
		  MyString type(type_str);
		  CollectorHashTable *cht = findOrCreateTable(type);
		  if (cht == NULL) {
			  dprintf(D_ALWAYS, "collect: findOrCreateTable failed\n");
			  insert = -3;
			  retVal = 0;
			  break;
		  }
		  if (!makeGenericAdHashKey (hk, clientAd))
		  {
			  dprintf(D_ALWAYS, "Could not make haskey --- ignoring ad\n");
			  insert = -3;
			  retVal = 0;
			  break;
		  }
		  hashString.Build(hk);
		  retVal = updateClassAd(*cht, type_str, type_str, clientAd,
					 hk, hashString, insert, from);
		  break;
	  }

	  case UPDATE_XFER_SERVICE_AD:
		if (!makeXferServiceAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
		retVal=updateClassAd (XferServiceAds, "XferServiceAd  ",
							  "XferService",
							  clientAd, hk, hashString, insert, from );
		break;

	  case UPDATE_LEASE_MANAGER_AD:
		if (!makeLeaseManagerAdHashKey (hk, clientAd))
		{
			dprintf (D_ALWAYS, "Could not make hashkey --- ignoring ad\n");
			insert = -3;
			retVal = 0;
			break;
		}
		hashString.Build( hk );
			// first, purge all the existing LeaseManager ads, since we
			// want to enforce that *ONLY* 1 manager is in the
			// collector any given time.
		purgeHashTable( LeaseManagerAds );
		retVal=updateClassAd (LeaseManagerAds, "LeaseManagerAd  ",
							  "LeaseManager",
							  clientAd, hk, hashString, insert, from );
		break;


	  case QUERY_STARTD_ADS:
	  case QUERY_SCHEDD_ADS:
	  case QUERY_MASTER_ADS:
	  case QUERY_GATEWAY_ADS:
	  case QUERY_SUBMITTOR_ADS:
	  case QUERY_CKPT_SRVR_ADS:
	  case QUERY_STARTD_PVT_ADS:
	  case QUERY_COLLECTOR_ADS:
	  case QUERY_NEGOTIATOR_ADS:
	  case QUERY_HAD_ADS:
	  case QUERY_XFER_SERVICE_ADS:
	  case QUERY_LEASE_MANAGER_ADS:
	  case QUERY_GENERIC_ADS:
	  case INVALIDATE_STARTD_ADS:
	  case INVALIDATE_SCHEDD_ADS:
	  case INVALIDATE_MASTER_ADS:
	  case INVALIDATE_GATEWAY_ADS:
	  case INVALIDATE_CKPT_SRVR_ADS:
	  case INVALIDATE_SUBMITTOR_ADS:
	  case INVALIDATE_COLLECTOR_ADS:
	  case INVALIDATE_NEGOTIATOR_ADS:
	  case INVALIDATE_HAD_ADS:
	  case INVALIDATE_XFER_SERVICE_ADS:
	  case INVALIDATE_LEASE_MANAGER_ADS:
	  case INVALIDATE_ADS_GENERIC:
		// these are not implemented in the engine, but we allow another
		// daemon to detect that these commands have been given
	    insert = -2;
		retVal = 0;
	    break;

	  default:
		dprintf (D_ALWAYS, "Received illegal command: %d\n", command);
		insert = -1;
		retVal = 0;
	}

	// return the updated ad
	return retVal;
}
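Nearly every UPDATE_* branch above repeats the same sequence: build a hash key, Build() the hash string, then call updateClassAd() on the right table. A sketch of how that repetition could be factored into a helper (a hypothetical refactoring; updateByKey is not an existing CollectorEngine method):

ClassAd *CollectorEngine::updateByKey(
	bool (*makeKey)(AdNameHashKey &, ClassAd *),
	CollectorHashTable &table, const char *adType, const char *label,
	ClassAd *clientAd, int &insert, const condor_sockaddr &from)
{
	AdNameHashKey hk;
	HashString hashString;
	if ( !makeKey( hk, clientAd ) ) {
		dprintf( D_ALWAYS, "Could not make hashkey --- ignoring ad\n" );
		insert = -3;
		return NULL;
	}
	hashString.Build( hk );
	return updateClassAd( table, adType, label, clientAd,
						  hk, hashString, insert, from );
}

Each simple case would then reduce to a single call, e.g. retVal = updateByKey( makeScheddAdHashKey, ScheddAds, "ScheddAd     ", "Schedd", clientAd, insert, from );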
Example #5
void
OfflineCollectorPlugin::configure ()
{

	dprintf (
		D_FULLDEBUG,
		"In OfflineCollectorPlugin::configure ()\n" );

	/**** Handle ABSENT_REQUIREMENTS PARAM ****/
	char *tmp;
	ExprTree *tmp_expr = 0;

	if (AbsentReq) delete AbsentReq;
	AbsentReq = NULL;
	tmp = param("ABSENT_REQUIREMENTS");
	if( tmp ) {
		if( ParseClassAdRvalExpr(tmp, AbsentReq) ) {
			EXCEPT ("Error parsing ABSENT_REQUIREMENTS expression: %s",
					tmp);
		}
#if !defined(WANT_OLD_CLASSADS)
		if(AbsentReq){
			tmp_expr = AddTargetRefs( AbsentReq, TargetMachineAttrs );
			delete AbsentReq;
		}
		AbsentReq = tmp_expr;
#endif
		dprintf (D_ALWAYS,"ABSENT_REQUIREMENTS = %s\n", tmp);
		free( tmp );
		tmp = NULL;
	} else {
		dprintf (D_ALWAYS,"ABSENT_REQUIREMENTS = None\n");
	}


	/**** Handle COLLECTOR_PERSISTENT_AD_LOG PARAM ****/

	if ( _persistent_store ) {
		/* was param()'d so we must use free() */
		free ( _persistent_store );
		_persistent_store = NULL;
	}

	_persistent_store = param("COLLECTOR_PERSISTENT_AD_LOG");
	// if not found, try deprecated name OFFLINE_LOG
	if ( ! _persistent_store ) {
		_persistent_store = param( "OFFLINE_LOG" );
	}

	if ( _persistent_store ) {

		dprintf (
			D_ALWAYS,
			"OfflineCollectorPlugin::configure: off-line ad "
			"persistent store: '%s'.\n",
			_persistent_store );

		if ( _ads ) {
			delete _ads;
			_ads = NULL;
		}

		_ads = new ClassAdCollection ( _persistent_store, 2 );
		ASSERT ( _ads );

	} else {

		dprintf (
			D_ALWAYS,
			"OfflineCollectorPlugin::configure: no persistent store "
			"was defined for off-line ads.\n" );

	}

}
Example #6
File: statsd.cpp  Project: blueskyll/condor
void
StatsD::initAndReconfig(char const *service_name)
{
    std::string param_name;

    ASSERT( service_name );

    m_default_metric_ad.Clear();

    int old_stats_pub_interval = m_stats_pub_interval;
    formatstr(param_name,"%s_INTERVAL",service_name);
    m_stats_pub_interval = param_integer(param_name.c_str(),60);
    if( m_stats_pub_interval < 0 ) {
        dprintf(D_ALWAYS,
                "%s is less than 0, so no stats publications will be made.\n",
                param_name.c_str());
        if( m_stats_pub_timer != -1 ) {
            daemonCore->Cancel_Timer(m_stats_pub_timer);
            m_stats_pub_timer = -1;
        }
    }
    else if( m_stats_pub_timer >= 0 ) {
        if( old_stats_pub_interval != m_stats_pub_interval ) {
            m_stats_time_till_pub = m_stats_time_till_pub + (m_stats_pub_interval - old_stats_pub_interval );
        }
    }
    else {
        m_stats_heartbeat_interval = MIN(m_stats_pub_interval,m_stats_heartbeat_interval);
        m_stats_pub_timer = daemonCore->Register_Timer(
                                m_stats_heartbeat_interval,
                                m_stats_heartbeat_interval,
                                (TimerHandlercpp)&StatsD::publishMetrics,
                                "Statsd::publishMetrics",
                                this );
    }
    if( old_stats_pub_interval != m_stats_pub_interval && m_stats_pub_interval > 0 )
    {
        dprintf(D_ALWAYS,
                "Will perform stats publication every %s=%d "
                "seconds.\n", param_name.c_str(),m_stats_pub_interval);
    }

    formatstr(param_name,"%s_VERBOSITY",service_name);
    m_verbosity = param_integer(param_name.c_str(),0);

    formatstr(param_name,"%s_REQUIREMENTS",service_name);
    param(m_requirements,param_name.c_str());

    formatstr(param_name,"%s_PER_EXECUTE_NODE_METRICS",service_name);
    m_per_execute_node_metrics = param_boolean(param_name.c_str(),true);

    formatstr(param_name,"%s_DEFAULT_CLUSTER",service_name);
    std::string default_cluster_expr;
    param(default_cluster_expr,param_name.c_str());

    if( !default_cluster_expr.empty() ) {
        classad::ClassAdParser parser;
        classad::ExprTree *expr=NULL;
        if( !parser.ParseExpression(default_cluster_expr,expr,true) || !expr ) {
            EXCEPT("Invalid %s=%s",param_name.c_str(),default_cluster_expr.c_str());
        }
        // The classad takes ownership of expr
        m_default_metric_ad.Insert(ATTR_CLUSTER,expr);
    }

    formatstr(param_name,"%s_DEFAULT_MACHINE",service_name);
    std::string default_machine_expr;
    param(default_machine_expr,param_name.c_str());

    if( !default_machine_expr.empty() ) {
        classad::ClassAdParser parser;
        classad::ExprTree *expr=NULL;
        if( !parser.ParseExpression(default_machine_expr,expr,true) || !expr ) {
            EXCEPT("Invalid %s=%s",param_name.c_str(),default_machine_expr.c_str());
        }
        // The classad takes ownership of expr
        m_default_metric_ad.Insert(ATTR_MACHINE,expr);
    }

    formatstr(param_name,"%s_DEFAULT_IP",service_name);
    std::string default_ip_expr;
    param(default_ip_expr,param_name.c_str());

    if( !default_ip_expr.empty() ) {
        classad::ClassAdParser parser;
        classad::ExprTree *expr=NULL;
        if( !parser.ParseExpression(default_ip_expr,expr,true) || !expr ) {
            EXCEPT("Invalid %s=%s",param_name.c_str(),default_ip_expr.c_str());
        }
        // The classad takes ownership of expr
        m_default_metric_ad.Insert(ATTR_IP,expr);
    }

    clearMetricDefinitions();
    std::string config_dir;
    formatstr(param_name,"%s_METRICS_CONFIG_DIR",service_name);
    param(config_dir,param_name.c_str());
    if( !config_dir.empty() ) {
        StringList file_list;
        if( !get_config_dir_file_list( config_dir.c_str(), file_list ) ) {
            EXCEPT("Failed to read metric configuration from %s\n",config_dir.c_str());
        }

        file_list.rewind();
        char const *fname;
        while( (fname=file_list.next()) ) {
            dprintf(D_ALWAYS,"Reading metric definitions from %s\n",fname);
            ParseMetricsFromFile(fname);
        }
    }
}
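One detail above deserves a worked example: when the publication interval changes while a timer is already pending, the code shifts the remaining wait by the difference rather than resetting the timer, so the next publication still lands one (new) interval after the previous one. A self-contained illustration of that arithmetic:

#include <cstdio>

int main()
{
    int old_interval = 60, new_interval = 120;
    int time_till_pub = 15;  // seconds left under the old schedule
    // Same adjustment as initAndReconfig() above:
    time_till_pub += (new_interval - old_interval);
    std::printf("next publication in %d seconds\n", time_till_pub);  // 75
    return 0;
}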
Example #7
/* Mess up the in memory job ad with interesting statistics about suspensions */
void record_suspension_hack(unsigned int action)
{
	char tmp[256];
	/* default to 0 in case the attributes are not yet in the ad */
	int total_suspensions = 0;
	int last_suspension_time = 0;
	int cumulative_suspension_time = 0;
	extern char *schedd;

	if (!JobAd)
	{
		EXCEPT("Suspension code: Non-existant JobAd");
	}

	switch(action)
	{
		case ULOG_JOB_SUSPENDED:
			/* Add to ad number of suspensions */
			JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions);
			total_suspensions++;
			sprintf(tmp, "%s = %d", ATTR_TOTAL_SUSPENSIONS, total_suspensions);
			JobAd->Insert(tmp);

			/* Add to ad the current suspension time */
			last_suspension_time = time(NULL);
			sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, 
				last_suspension_time);
			JobAd->Insert(tmp);
			break;
		case ULOG_JOB_UNSUSPENDED: {
			/* add in the time I spent suspended to a running total */
			JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME,
				cumulative_suspension_time);
			JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME,
				last_suspension_time);
			int delta = time(NULL) - last_suspension_time;
			cumulative_suspension_time += delta;
			sprintf(tmp, "%s = %d", ATTR_CUMULATIVE_SUSPENSION_TIME,
				cumulative_suspension_time);
			JobAd->Insert(tmp);

			int uncommitted_suspension_time = 0;
			JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME,
								 uncommitted_suspension_time);
			uncommitted_suspension_time += delta;
			JobAd->Assign(ATTR_UNCOMMITTED_SUSPENSION_TIME,uncommitted_suspension_time);

			/* set the current suspension time to zero, meaning not suspended */
			last_suspension_time = 0;
			sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, 
				last_suspension_time);
			JobAd->Insert(tmp);
			break;
		}
		default:
			EXCEPT("record_suspension_hack(): Action event not recognized.");
			break;
	}

	/* Sanity output */
	JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions);
	dprintf(D_FULLDEBUG,"%s = %d\n", ATTR_TOTAL_SUSPENSIONS, total_suspensions);
	JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_suspension_time);
	dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_LAST_SUSPENSION_TIME,
		last_suspension_time);
	JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, 
		cumulative_suspension_time);
	dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_CUMULATIVE_SUSPENSION_TIME,
		cumulative_suspension_time);
	
	/* If we've been asked to perform real time updates of the suspension
		information, then connect to the queue and do it here. */
	if (param_boolean("REAL_TIME_JOB_SUSPEND_UPDATES", false))
	{
			dprintf( D_ALWAYS, "Updating suspension info to schedd.\n" );
			if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT)) {
				/* Since these attributes aren't updated periodically, if
					the schedd is busy and a resume event update is lost,
					the job will be marked suspended when it really isn't.
					The new shadow eventually corrects this via a periodic
					update of various classad attributes, but I
					suspect it won't be corrected in the event of a
					bad connect here for this shadow. */
				dprintf( D_ALWAYS, 
					"Timeout connecting to schedd. Suspension update lost.\n");
				return;
			}

			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
				ATTR_TOTAL_SUSPENSIONS, total_suspensions);
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
				ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time);
			SetAttributeInt(Proc->id.cluster, Proc->id.proc,
				ATTR_LAST_SUSPENSION_TIME, last_suspension_time);

			DisconnectQ(NULL);
	}
}
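The bookkeeping above is easier to see stripped of the ClassAd plumbing: on suspend, count the event and record the timestamp; on resume, fold the elapsed time into the cumulative totals and zero the timestamp. A self-contained sketch (a plain struct stands in for the job ad):

#include <ctime>

struct SuspensionStats {
	int total_suspensions = 0;
	std::time_t last_suspension_time = 0;        // 0 means "not suspended"
	std::time_t cumulative_suspension_time = 0;

	void suspend() {
		++total_suspensions;
		last_suspension_time = std::time(nullptr);
	}
	void resume() {
		cumulative_suspension_time += std::time(nullptr) - last_suspension_time;
		last_suspension_time = 0;                // mirrors the reset above
	}
};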
Example #8
EC2Resource::BatchStatusResult EC2Resource::StartBatchStatus() {
    ASSERT( status_gahp );

    // m_checkSpotNext starts out false
    if( ! m_checkSpotNext ) {
        StringList returnStatus;
        std::string errorCode;
        int rc = status_gahp->ec2_vm_status_all( resourceName,
                    m_public_key_file, m_private_key_file,
                    returnStatus, errorCode );

        if( rc == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }
    
        if( rc != 0 ) {
            std::string errorString = status_gahp->getErrorString();
            dprintf( D_ALWAYS, "Error doing batched EC2 status query: %s: %s.\n",
                     errorCode.c_str(), errorString.c_str() );
            return BSR_ERROR;
        }

        //
        // We have to let a job know if we can't find a status report for it.
        //
        List<EC2Job> myJobs;
        EC2Job * nextJob = NULL;
        BaseJob *nextBaseJob = NULL;
        registeredJobs.Rewind();
        while ( (nextBaseJob = registeredJobs.Next()) ) {
            nextJob = dynamic_cast< EC2Job * >( nextBaseJob );
            ASSERT( nextJob );
            if ( !nextJob->m_client_token.empty() ) {
                myJobs.Append( nextJob );
            }
        }

        returnStatus.rewind();
        ASSERT( returnStatus.number() % 6 == 0 );
        for( int i = 0; i < returnStatus.number(); i += 6 ) {
            std::string instanceID = returnStatus.next();
            std::string status = returnStatus.next();
            std::string clientToken = returnStatus.next();
            std::string keyName = returnStatus.next();
            std::string stateReasonCode = returnStatus.next();
            std::string publicDNSName = returnStatus.next();

            // Efficiency suggests we look via the instance ID first,
            // and then try to look things up via the client token
            // (or, for GT #3682, via the keypair ID).

            // We can't use BaseJob::JobsByRemoteId because OpenStack doesn't
            // include the client token in its status responses, and therefore
            // we can't always fully reconstruct the remoteJobID used as the key.
            EC2Job * job = NULL;
            rc = jobsByInstanceID.lookup( HashKey( instanceID.c_str() ), job );
            if( rc == 0 ) {
                ASSERT( job );
        
                dprintf( D_FULLDEBUG, "Found job object for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                   stateReasonCode.c_str(), publicDNSName.c_str() );
                myJobs.Delete( job );
                continue;
            }

            // If we got a client token, use that to look up the job.  We
            // don't use the instance ID because we may discover it in
            // this function.  Since we need instance ID -based dispatch
            // code for OpenStack anyway, we'll just use it, rather than
            // trying the remoteJobID with the instance ID if we don't
            // find it using only the client token.
            if( ! clientToken.empty() && clientToken != "NULL" ) {
                std::string remoteJobID;
                formatstr( remoteJobID, "ec2 %s %s", resourceName, clientToken.c_str() );
                
                BaseJob * tmp = NULL;
                rc = BaseJob::JobsByRemoteId.lookup( HashKey( remoteJobID.c_str() ), tmp );
                
                if( rc == 0 ) {
                    ASSERT( tmp );
                    EC2Job * job = dynamic_cast< EC2Job * >( tmp );
                    if( job == NULL ) {
                        EXCEPT( "Found non-EC2Job identified by '%s'.", remoteJobID.c_str() );
                    }
                    
                    dprintf( D_FULLDEBUG, "Found job object via client token for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                    job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                       stateReasonCode.c_str(), publicDNSName.c_str() );
                    myJobs.Delete( job );
                    continue;
                }
            }
            
            // Some servers (OpenStack, Eucalyptus) silently ignore client
            // tokens. So we need to use the ssh keypair to find jobs that
            // were submitted but which we don't have an instance ID for.
            //
            // TODO This code should be made more efficient. We can
            //   do something better than a linear scan through all
            //   jobs for each status result. Ideally, we'd parse the
            //   ssh keypair name and if it looks like one we generated,
            //   pluck out the job id.
            if ( !ClientTokenWorks() && !keyName.empty() && keyName != "NULL" ) {
                myJobs.Rewind();
                while ( ( job = myJobs.Next() ) ) {
                    if ( job->m_key_pair == keyName ) {
                        dprintf( D_FULLDEBUG, "Found job object via ssh keypair for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                        job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                           stateReasonCode.c_str(),
                                           publicDNSName.c_str() );
                        myJobs.Delete( job );
                        continue;
                    }
                }
            }

            dprintf( D_FULLDEBUG, "Found unknown instance '%s'; skipping.\n", instanceID.c_str() );
            continue;
        }
    
        myJobs.Rewind();
        while( ( nextJob = myJobs.Next() ) ) {
            dprintf( D_FULLDEBUG, "Informing job %p it got no status.\n", nextJob );
            nextJob->StatusUpdate( NULL, NULL, NULL, NULL );
        }
    
        // Don't ask for spot results unless we know about a spot job.  This
        // should prevent us from breaking OpenStack.
        if( spotJobsByRequestID.getNumElements() == 0 ) {
            m_checkSpotNext = false;
            return BSR_DONE;
        } else {
            m_checkSpotNext = true;
        }
    }
    
    if( m_checkSpotNext ) {
        StringList spotReturnStatus;
        std::string spotErrorCode;
        int spotRC = status_gahp->ec2_spot_status_all( resourceName,
                        m_public_key_file, m_private_key_file,
                        spotReturnStatus, spotErrorCode );

        if( spotRC == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }

        if( spotRC != 0 ) {
            std::string errorString = status_gahp->getErrorString();
            dprintf( D_ALWAYS, "Error doing batched EC2 spot status query: %s: %s.\n",
                     spotErrorCode.c_str(), errorString.c_str() );
            return BSR_ERROR;
        }

        List<EC2Job> mySpotJobs;
        EC2Job * nextSpotJob = NULL;
        spotJobsByRequestID.startIterations();
        while( spotJobsByRequestID.iterate( nextSpotJob ) ) {
            mySpotJobs.Append( nextSpotJob );
        }
    
        spotReturnStatus.rewind();
        ASSERT( spotReturnStatus.number() % 5 == 0 );
        for( int i = 0; i < spotReturnStatus.number(); i += 5 ) {
            std::string requestID = spotReturnStatus.next();
            std::string state = spotReturnStatus.next();
            /* std::string launchGroup = */ spotReturnStatus.next();
            /* std::string instanceID = */ spotReturnStatus.next();
            std::string statusCode = spotReturnStatus.next();
            
            EC2Job * spotJob = NULL;
            spotRC = spotJobsByRequestID.lookup( HashKey( requestID.c_str() ), spotJob );
            if( spotRC != 0 ) {
                dprintf( D_FULLDEBUG, "Found unknown spot request '%s'; skipping.\n", requestID.c_str() );
                continue;
            }
            ASSERT( spotJob );

            if( ! statusCode.empty() ) { state = statusCode; }

            dprintf( D_FULLDEBUG, "Found spot job object for '%s', updating status ('%s').\n", requestID.c_str(), state.c_str() );
            spotJob->StatusUpdate( NULL, state.c_str(), NULL, NULL );
            mySpotJobs.Delete( spotJob );
        }

        mySpotJobs.Rewind();
        while( ( nextSpotJob = mySpotJobs.Next() ) ) {
            dprintf( D_FULLDEBUG, "Informing spot job %p it got no status.\n", nextSpotJob );
            nextSpotJob->StatusUpdate( NULL, NULL, NULL, NULL );
        }
        
        m_checkSpotNext = false;
        return BSR_DONE;
    }

    // This should never happen (but the compiler hates you).
    return BSR_ERROR;
}
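The GAHP hands back the bulk status as a flat list whose length must be a multiple of six, consumed in strides exactly as above. A self-contained sketch of that framing (InstanceStatus and the std::vector stand-in for StringList are illustrative):

#include <cassert>
#include <string>
#include <vector>

struct InstanceStatus {
    std::string instance_id, status, client_token,
                key_name, state_reason_code, public_dns;
};

static std::vector<InstanceStatus> parseStatusAll(const std::vector<std::string> &flat)
{
    assert(flat.size() % 6 == 0);  // same invariant as the ASSERT above
    std::vector<InstanceStatus> out;
    for (size_t i = 0; i < flat.size(); i += 6) {
        out.push_back({ flat[i], flat[i+1], flat[i+2],
                        flat[i+3], flat[i+4], flat[i+5] });
    }
    return out;
}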
Example #9
int
OsProc::StartJob(FamilyInfo* family_info, NetworkNamespaceManager * network_manager = NULL, FilesystemRemap* fs_remap=NULL)
{
	int nice_inc = 0;
	bool has_wrapper = false;

	dprintf(D_FULLDEBUG,"in OsProc::StartJob()\n");

	if ( !JobAd ) {
		dprintf ( D_ALWAYS, "No JobAd in OsProc::StartJob()!\n" );
		return 0;
	}

	MyString JobName;
	if ( JobAd->LookupString( ATTR_JOB_CMD, JobName ) != 1 ) {
		dprintf( D_ALWAYS, "%s not found in JobAd.  Aborting StartJob.\n", 
				 ATTR_JOB_CMD );
		return 0;
	}

	const char* job_iwd = Starter->jic->jobRemoteIWD();
	dprintf( D_ALWAYS, "IWD: %s\n", job_iwd );

		// some operations below will require a PrivSepHelper if
		// PrivSep is enabled (if it's not, privsep_helper will be
		// NULL)
	PrivSepHelper* privsep_helper = Starter->privSepHelper();

		// // // // // // 
		// Arguments
		// // // // // // 

		// prepend the full path to this name so that we
		// don't have to rely on the PATH inside the
		// USER_JOB_WRAPPER or for exec().

	bool transfer_exe = false;
	if (!JobAd->LookupBool(ATTR_TRANSFER_EXECUTABLE, transfer_exe)) {
		transfer_exe = false;
	}

	bool preserve_rel = false;
	if (!JobAd->LookupBool(ATTR_PRESERVE_RELATIVE_EXECUTABLE, preserve_rel)) {
		preserve_rel = false;
	}

	bool relative_exe = is_relative_to_cwd(JobName.Value());

	if (relative_exe && preserve_rel && !transfer_exe) {
		dprintf(D_ALWAYS, "Preserving relative executable path: %s\n", JobName.Value());
	}
	else if ( strcmp(CONDOR_EXEC,JobName.Value()) == 0 ) {
		JobName.sprintf( "%s%c%s",
		                 Starter->GetWorkingDir(),
		                 DIR_DELIM_CHAR,
		                 CONDOR_EXEC );
	}
	else if (relative_exe && job_iwd && *job_iwd) {
		MyString full_name;
		full_name.sprintf("%s%c%s",
		                  job_iwd,
		                  DIR_DELIM_CHAR,
		                  JobName.Value());
		JobName = full_name;

	}

	if( Starter->isGridshell() ) {
			// if we're a gridshell, just try to chmod our job, since
			// globus probably transfered it for us and left it with
			// bad permissions...
		priv_state old_priv = set_user_priv();
		int retval = chmod( JobName.Value(), S_IRWXU | S_IRWXO | S_IRWXG );
		set_priv( old_priv );
		if( retval < 0 ) {
			dprintf ( D_ALWAYS, "Failed to chmod %s!\n", JobName.Value() );
			return 0;
		}
	} 

	ArgList args;

		// Since we may be adding to the argument list, we may need to deal
		// with platform-specific arg syntax in the user's args in order
		// to successfully merge them with the additional wrapper args.
	args.SetArgV1SyntaxToCurrentPlatform();

		// First, put "condor_exec" or whatever at the front of Args,
		// since that will become argv[0] of what we exec(), either
		// the wrapper or the actual job.

	if( !getArgv0() ) {
		args.AppendArg(JobName.Value());
	} else {
		args.AppendArg(getArgv0());
	}
	
		// Support USER_JOB_WRAPPER parameter...
	char *wrapper = NULL;
	if( (wrapper=param("USER_JOB_WRAPPER")) ) {

			// make certain this wrapper program exists and is executable
		if( access(wrapper,X_OK) < 0 ) {
			dprintf( D_ALWAYS, 
					 "Cannot find/execute USER_JOB_WRAPPER file %s\n",
					 wrapper );
			free( wrapper );
			return 0;
		}
		has_wrapper = true;
			// Now, we've got a valid wrapper.  We want that to become
			// "JobName" so we exec it directly, and we want to put
			// what was the JobName (with the full path) as the first
			// argument to the wrapper
		args.AppendArg(JobName.Value());
		JobName = wrapper;
		free(wrapper);
	}
	
		// Support USE_PARROT 
	bool use_parrot = false;
	if( JobAd->LookupBool( ATTR_USE_PARROT, use_parrot) ) {
			// Check for parrot executable
		char *parrot = NULL;
		if( (parrot=param("PARROT")) ) {
			if( access(parrot,X_OK) < 0 ) {
				dprintf( D_ALWAYS, "Unable to use parrot(Cannot find/execute "
					"at %s(%s)).\n", parrot, strerror(errno) );
				free( parrot );
				return 0;
			} else {
				args.AppendArg(JobName.Value());
				JobName = parrot;
				free( parrot );
			}
		} else {
			dprintf( D_ALWAYS, "Unable to use parrot(Undefined path in config"
			" file)" );
			return 0;
		}
	}

		// Either way, we now have to add the user-specified args as
		// the rest of the Args string.
	MyString args_error;
	if(!args.AppendArgsFromClassAd(JobAd,&args_error)) {
		dprintf(D_ALWAYS, "Failed to read job arguments from JobAd.  "
				"Aborting OsProc::StartJob: %s\n",args_error.Value());
		return 0;
	}

		// // // // // // 
		// Environment 
		// // // // // // 

		// Now, instantiate an Env object so we can manipulate the
		// environment as needed.
	Env job_env;

	MyString env_errors;
	if( !Starter->GetJobEnv(JobAd,&job_env,&env_errors) ) {
		dprintf( D_ALWAYS, "Aborting OSProc::StartJob: %s\n",
				 env_errors.Value());
		return 0;
	}


		// // // // // // 
		// Standard Files
		// // // // // // 

	// handle stdin, stdout, and stderr redirection
	int fds[3];
		// initialize these to -2 to mean they're not specified.
		// -1 will be treated as an error.
	fds[0] = -2; fds[1] = -2; fds[2] = -2;

		// in order to open these files we must have the user's privs:
	priv_state priv;
	priv = set_user_priv();

		// if we're in PrivSep mode, we won't necessarily be able to
		// open the files for the job. getStdFile will return us an
		// open FD in some situations, but otherwise will give us
		// a filename that we'll pass to the PrivSep Switchboard
		//
	bool stdin_ok;
	bool stdout_ok;
	bool stderr_ok;
	MyString privsep_stdin_name;
	MyString privsep_stdout_name;
	MyString privsep_stderr_name;
	if (privsep_helper != NULL) {
		stdin_ok = getStdFile(SFT_IN,
		                      NULL,
		                      true,
		                      "Input file",
		                      &fds[0],
		                      &privsep_stdin_name);
		stdout_ok = getStdFile(SFT_OUT,
		                       NULL,
		                       true,
		                       "Output file",
		                       &fds[1],
		                       &privsep_stdout_name);
		stderr_ok = getStdFile(SFT_ERR,
		                       NULL,
		                       true,
		                       "Error file",
		                       &fds[2],
		                       &privsep_stderr_name);
	}
	else {
		fds[0] = openStdFile( SFT_IN,
		                      NULL,
		                      true,
		                      "Input file");
		stdin_ok = (fds[0] != -1);
		fds[1] = openStdFile( SFT_OUT,
		                      NULL,
		                      true,
		                      "Output file");
		stdout_ok = (fds[1] != -1);
		fds[2] = openStdFile( SFT_ERR,
		                      NULL,
		                      true,
		                      "Error file");
		stderr_ok = (fds[2] != -1);
	}

	/* Bail out if we couldn't open the std files correctly */
	if( !stdin_ok || !stdout_ok || !stderr_ok ) {
		/* only close ones that had been opened correctly */
		for ( int i = 0; i <= 2; i++ ) {
			if ( fds[i] >= 0 ) {
				daemonCore->Close_FD ( fds[i] );
			}
		}
		dprintf(D_ALWAYS, "Failed to open some/all of the std files...\n");
		dprintf(D_ALWAYS, "Aborting OsProc::StartJob.\n");
		set_priv(priv); /* go back to original priv state before leaving */
		return 0;
	}

		// // // // // // 
		// Misc + Exec
		// // // // // // 

	if( !ThisProcRunsAlongsideMainProc() ) {
		Starter->jic->notifyJobPreSpawn();
	}

	// compute job's renice value by evaluating the machine's
	// JOB_RENICE_INCREMENT in the context of the job ad...

	char* ptmp = param( "JOB_RENICE_INCREMENT" );
	if( ptmp ) {
			// insert renice expr into our copy of the job ad
		MyString reniceAttr = "Renice = ";
		reniceAttr += ptmp;
		if( !JobAd->Insert( reniceAttr.Value() ) ) {
			dprintf( D_ALWAYS, "ERROR: failed to insert JOB_RENICE_INCREMENT "
				"into job ad, Aborting OsProc::StartJob...\n" );
			free( ptmp );
			return 0;
		}
			// evaluate
		if( JobAd->EvalInteger( "Renice", NULL, nice_inc ) ) {
			dprintf( D_ALWAYS, "Renice expr \"%s\" evaluated to %d\n",
					 ptmp, nice_inc );
		} else {
			dprintf( D_ALWAYS, "WARNING: job renice expr (\"%s\") doesn't "
					 "eval to int!  Using default of 10...\n", ptmp );
			nice_inc = 10;
		}

			// enforce valid ranges for nice_inc
		if( nice_inc < 0 ) {
			dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
					 "low: adjusted to 0\n", nice_inc );
			nice_inc = 0;
		}
		else if( nice_inc > 19 ) {
			dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
					 "high: adjusted to 19\n", nice_inc );
			nice_inc = 19;
		}

		ASSERT( ptmp );
		free( ptmp );
		ptmp = NULL;
	} else {
			// if JOB_RENICE_INCREMENT is undefined, default to 10
		nice_inc = 10;
	}

		// in the below dprintfs, we want to skip past argv[0], which
		// is sometimes condor_exec, in the Args string. 

	MyString args_string;
	args.GetArgsStringForDisplay(&args_string, 1);
	if( has_wrapper ) { 
			// print out exactly what we're doing so folks can debug
			// it, if they need to.
		dprintf( D_ALWAYS, "Using wrapper %s to exec %s\n", JobName.Value(), 
				 args_string.Value() );

		MyString wrapper_err;
		wrapper_err.sprintf("%s%c%s", Starter->GetWorkingDir(),
				 	DIR_DELIM_CHAR,
					JOB_WRAPPER_FAILURE_FILE);
		if( ! job_env.SetEnv("_CONDOR_WRAPPER_ERROR_FILE", wrapper_err.Value()) ) {
			dprintf( D_ALWAYS, "Failed to set _CONDOR_WRAPPER_ERROR_FILE environment variable\n");
		}
	} else {
		dprintf( D_ALWAYS, "About to exec %s %s\n", JobName.Value(),
				 args_string.Value() );
	}

	MyString path;
	path.sprintf("%s%c%s", Starter->GetWorkingDir(),
			 	DIR_DELIM_CHAR,
				MACHINE_AD_FILENAME);
	if( ! job_env.SetEnv("_CONDOR_MACHINE_AD", path.Value()) ) {
		dprintf( D_ALWAYS, "Failed to set _CONDOR_MACHINE_AD environment variable\n");
	}

	path.sprintf("%s%c%s", Starter->GetWorkingDir(),
			 	DIR_DELIM_CHAR,
				JOB_AD_FILENAME);
	if( ! job_env.SetEnv("_CONDOR_JOB_AD", path.Value()) ) {
		dprintf( D_ALWAYS, "Failed to set _CONDOR_JOB_AD environment variable\n");
	}

		// Grab the full environment back out of the Env object 
	if(IsFulldebug(D_FULLDEBUG)) {
		MyString env_string;
		job_env.getDelimitedStringForDisplay(&env_string);
		dprintf(D_FULLDEBUG, "Env = %s\n", env_string.Value());
	}

	// Check to see if we need to start this process paused, and if
	// so, pass the right flag to DC::Create_Process().
	int job_opt_mask = DCJOBOPT_NO_CONDOR_ENV_INHERIT;
	if (!param_boolean("JOB_INHERITS_STARTER_ENVIRONMENT",false)) {
		job_opt_mask |= DCJOBOPT_NO_ENV_INHERIT;
	}
	int suspend_job_at_exec = 0;
	JobAd->LookupBool( ATTR_SUSPEND_JOB_AT_EXEC, suspend_job_at_exec);
	if( suspend_job_at_exec ) {
		dprintf( D_FULLDEBUG, "OsProc::StartJob(): "
				 "Job wants to be suspended at exec\n" );
		job_opt_mask |= DCJOBOPT_SUSPEND_ON_EXEC;
	}

	// If there is a requested coresize for this job, enforce it.
	// It is truncated because you can't put an unsigned integer
	// into a classad. I could rewrite condor's use of ATTR_CORE_SIZE to
	// be a float, but then when that attribute is read/written to the
	// job queue log by/or shared between versions of Condor which view the
	// type of that attribute differently, calamity would arise.
	int core_size_truncated;
	size_t core_size;
	size_t *core_size_ptr = NULL;
	if ( JobAd->LookupInteger( ATTR_CORE_SIZE, core_size_truncated ) ) {
		core_size = (size_t)core_size_truncated;
		core_size_ptr = &core_size;
	}

	long rlimit_as_hard_limit = 0;
	char *rlimit_expr = param("STARTER_RLIMIT_AS");
	if (rlimit_expr) {
		classad::ClassAdParser parser;

		classad::ExprTree *tree = parser.ParseExpression(rlimit_expr);
		if (tree) {
			classad::Value val;
			int result;

			if (EvalExprTree(tree, Starter->jic->machClassAd(), JobAd, val) && 
				val.IsIntegerValue(result)) {
					rlimit_as_hard_limit = ((long)result) * 1024 * 1024;
					dprintf(D_ALWAYS, "Setting job's virtual memory rlimit to %ld megabytes\n", rlimit_as_hard_limit);
			} else {
				dprintf(D_ALWAYS, "Can't evaluate STARTER_RLIMIT_AS expression %s\n", rlimit_expr);
			}
		} else {
			dprintf(D_ALWAYS, "Can't parse STARTER_RLIMIT_AS expression: %s\n", rlimit_expr);
		}
	}

	int *affinity_mask = makeCpuAffinityMask(Starter->getMySlotNumber());

#if defined ( WIN32 )
    owner_profile_.update ();
    /*************************************************************
    NOTE: We currently *ONLY* support loading slot-user profiles.
    This limitation will be addressed shortly, by allowing regular 
    users to load their registry hive - Ben [2008-09-31]
    **************************************************************/
    bool load_profile = false,
         run_as_owner = false;
    JobAd->LookupBool ( ATTR_JOB_LOAD_PROFILE, load_profile );
    JobAd->LookupBool ( ATTR_JOB_RUNAS_OWNER,  run_as_owner );
    if ( load_profile && !run_as_owner ) {
        if ( owner_profile_.load () ) {
            /* publish the users environment into that of the main 

            job's environment */
            if ( !owner_profile_.environment ( job_env ) ) {
                dprintf ( D_ALWAYS, "OsProc::StartJob(): Failed to "
                    "export owner's environment.\n" );
            }            
        } else {
            dprintf ( D_ALWAYS, "OsProc::StartJob(): Failed to load "
                "owner's profile.\n" );
        }
    }
#endif

		// While we are still in user priv, print out the username
#if defined(LINUX)
	if( Starter->glexecPrivSepHelper() ) {
			// TODO: if there is some way to figure out the final username,
			// print it out here or after starting the job.
		dprintf(D_ALWAYS,"Running job via glexec\n");
	}
#else
	if( false ) {
	}
#endif
	else {
		char const *username = NULL;
		char const *how = "";
		CondorPrivSepHelper* cpsh = Starter->condorPrivSepHelper();
		if( cpsh ) {
			username = cpsh->get_user_name();
			how = "via privsep switchboard ";
		}
		else {
			username = get_real_username();
		}
		if( !username ) {
			username = "******";
		}
		dprintf(D_ALWAYS,"Running job %sas user %s\n",how,username);
	}

	set_priv ( priv );

	// use this to return more detailed and reliable error message info
	// from create-process operation.
	MyString create_process_err_msg;

	if (privsep_helper != NULL) {
		const char* std_file_names[3] = {
			privsep_stdin_name.Value(),
			privsep_stdout_name.Value(),
			privsep_stderr_name.Value()
		};
		JobPid = privsep_helper->create_process(JobName.Value(),
		                                        args,
		                                        job_env,
		                                        job_iwd,
		                                        fds,
		                                        std_file_names,
		                                        nice_inc,
		                                        core_size_ptr,
		                                        1,
		                                        job_opt_mask,
		                                        family_info,
												affinity_mask,
												&create_process_err_msg);
	}
	else {
		JobPid = daemonCore->Create_Process( JobName.Value(),
		                                     args,
		                                     PRIV_USER_FINAL,
		                                     1,
		                                     FALSE,
		                                     &job_env,
		                                     job_iwd,
		                                     family_info,
		                                     NULL,
		                                     fds,
		                                     NULL,
		                                     nice_inc,
		                                     NULL,
		                                     job_opt_mask, 
		                                     core_size_ptr,
                                             affinity_mask,
											 NULL,
                                             &create_process_err_msg,
					     fs_remap,
					     rlimit_as_hard_limit,
                                             network_manager);
	}

	// Create_Process() saves the errno for us if it is an "interesting" error.
	int create_process_errno = errno;

	// errno is 0 in the privsep case.  This executes for the daemon core create-process logic
	if ((FALSE == JobPid) && (0 != create_process_errno)) {
		if (create_process_err_msg != "") create_process_err_msg += " ";
		MyString errbuf;
		errbuf.sprintf("(errno=%d: '%s')", create_process_errno, strerror(create_process_errno));
		create_process_err_msg += errbuf;
	}

	// now close the descriptors in fds array.  our child has inherited
	// them already, so we should close them so we do not leak descriptors.
	// NOTE, we want to use a special method to close the starter's
	// versions, if that's what we're using, so we don't think we've
	// still got those available in other parts of the code for any
	// reason.
	for ( int i = 0; i <= 2; i++ ) {
		if ( fds[i] >= 0 ) {
			daemonCore->Close_FD ( fds[i] );
		}
	}

	if ( JobPid == FALSE ) {
		JobPid = -1;

		if(!create_process_err_msg.IsEmpty()) {

			// if the reason Create_Process failed was that registering
			// a family with the ProcD failed, it is indicative of a
			// problem regarding this execute machine, not the job. in
			// this case, we'll want to EXCEPT instead of telling the
			// Shadow to put the job on hold. there are probably other
			// error conditions where EXCEPTing would be more appropriate
			// as well...
			//
			if (create_process_errno == DaemonCore::ERRNO_REGISTRATION_FAILED) {
				EXCEPT("Create_Process failed to register the job with the ProcD");
			}

			MyString err_msg = "Failed to execute '";
			err_msg += JobName;
			err_msg += "'";
			if(!args_string.IsEmpty()) {
				err_msg += " with arguments ";
				err_msg += args_string.Value();
			}
			err_msg += ": ";
			err_msg += create_process_err_msg;
			if( !ThisProcRunsAlongsideMainProc() ) {
				Starter->jic->notifyStarterError( err_msg.Value(),
			    	                              true,
			        	                          CONDOR_HOLD_CODE_FailedToCreateProcess,
			            	                      create_process_errno );
			}
		}

		dprintf(D_ALWAYS,"Create_Process(%s,%s, ...) failed: %s\n",
			JobName.Value(), args_string.Value(), create_process_err_msg.Value());
		return 0;
	}

	num_pids++;

	dprintf(D_ALWAYS,"Create_Process succeeded, pid=%d\n",JobPid);

	job_start_time.getTime();

	return 1;
}
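Among the environment variables set above, _CONDOR_MACHINE_AD and _CONDOR_JOB_AD point the job at serialized copies of its machine and job ads in the working directory. A minimal job-side sketch that locates them (illustrative; a real job would go on to parse the files as ClassAds):

#include <cstdio>
#include <cstdlib>

int main()
{
	const char *machine_ad = std::getenv("_CONDOR_MACHINE_AD");
	const char *job_ad = std::getenv("_CONDOR_JOB_AD");
	std::printf("machine ad: %s\njob ad: %s\n",
	            machine_ad ? machine_ad : "(unset)",
	            job_ad ? job_ad : "(unset)");
	return 0;
}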
Example #10
DCMsg::MessageClosureEnum
ScheddNegotiate::messageReceived( DCMessenger *messenger, Sock *sock )
{
		// This is called when readMsg() returns true.
		// Now carry out the negotiator's request that we just read.

	switch( m_operation ) {
	case REJECTED:
		m_reject_reason = "Unknown reason";
	case REJECTED_WITH_REASON:
		scheduler_handleJobRejected( m_current_job_id, m_reject_reason.c_str() );

		m_jobs_rejected++;
		setAutoClusterRejected( m_current_auto_cluster_id );
		nextJob();
		break;

	case SEND_JOB_INFO:
		if( !sendJobInfo(sock) ) {
				// We failed to talk to the negotiator, so close the socket.
			return MESSAGE_FINISHED;
		}
		break;

	case PERMISSION_AND_AD: {
		// If the slot we matched is partitionable, edit it
		// so it will look like the resulting dynamic slot. 
		// NOTE: Seems like we no longer need to do this here,
		// since we also do the fixup at claim time in
		// contactStartd().  - Todd 1/12 <*****@*****.**>
		if( !fixupPartitionableSlot(&m_current_job_ad,&m_match_ad) )
		{
			nextJob();
			break;
		}

		std::string slot_name_buf;
		m_match_ad.LookupString(ATTR_NAME,slot_name_buf);
		char const *slot_name = slot_name_buf.c_str();

		int offline = false;
		m_match_ad.EvalBool(ATTR_OFFLINE,NULL,offline);

		if( offline ) {
			dprintf(D_ALWAYS,"Job %d.%d matched to offline machine %s.\n",
					m_current_job_id.cluster,m_current_job_id.proc,slot_name);
			nextJob();
			break;
		}

		if( scheduler_handleMatch(m_current_job_id,m_claim_id.c_str(),m_match_ad,slot_name) )
		{
			m_jobs_matched++;
		}
		nextJob();
		break;
	}

	case END_NEGOTIATE:
		dprintf( D_ALWAYS, "Lost priority - %d jobs matched\n",
				 m_jobs_matched );

		m_negotiation_finished = true;
		break;
	default:
		EXCEPT("should never get here (negotiation op %d)",m_operation);
	}


	if( m_negotiation_finished ) {
			// the following function takes ownership of sock
		scheduler_handleNegotiationFinished( sock );
		sock = NULL;
	}
	else {
			// wait for negotiator to write a response
		messenger->startReceiveMsg( this, sock );
	}

		// By returning MESSAGE_CONTINUING, we tell messenger not to
		// close the socket.  Either we have finished negotiating and
		// sock has been taken care of by the scheduler (e.g. by
		// registering it to wait for the next NEGOTIATE command), or
		// we are not yet done with negotiating and we are waiting for
		// the next operation within the current negotiation round.
	return MESSAGE_CONTINUING;
}
Example #11
bool
ScheddNegotiate::writeMsg( DCMessenger * /*messenger*/, Sock * /*sock*/ )
{
	EXCEPT("this should never be called");
	return false;
}
Example #12
int
stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		// CREATE_CONDOR_SECURITY_SESSION contains sensitive data that
		// normally shouldn't be written to a publicly-readable log.
		// We conceal it unless GAHP_DEBUG_HIDE_SENSITIVE_DATA says
		// not to.
		if ( param_boolean( "GAHP_DEBUG_HIDE_SENSITIVE_DATA", true ) &&
			 strncmp( command, GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					  strlen( GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION ) ) == 0 ) {
			dprintf( D_ALWAYS, "got stdin: %s XXXXXXXX\n",
					 GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION );
		} else {
			dprintf (D_ALWAYS, "got stdin: %s\n", command);
		}

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {
					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
				gahp_output_return_success();
				DC_Exit(0);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
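				// NOTE: this branch is unreachable -- GAHP_COMMAND_QUIT is
				// already matched (and exits via DC_Exit) earlier in this chain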
				gahp_output_return_success();
				return 0; // exit
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					GAHP_COMMAND_CONDOR_VERSION,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
				gahp_output_return (commands, 12);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION) == 0) {
				ClaimIdParser claimid( args.argv[1] );
				if ( !daemonCore->getSecMan()->CreateNonNegotiatedSecuritySession(
										DAEMON,
										claimid.secSessionId(),
										claimid.secSessionKey(),
										claimid.secSessionInfo(),
										CONDOR_PARENT_FQU,
										NULL,
										0 ) ) {
					gahp_output_return_error();
				} else {
					sec_session_id = claimid.secSessionId();
					gahp_output_return_success();
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CONDOR_VERSION) == 0) {
				peer_condor_version = args.argv[1];

				const char *reply [] = { GAHP_RESULT_SUCCESS,
										 escapeGahpString( CondorVersion() ) };
				gahp_output_return( reply, 2 );

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

				int fds[2] = {-1,-1};
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[2] = {
						"Worker thread failed",
						"NULL"
					};
					enqueue_result(args.argv[1], res, 2);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

				int fds[2] = {-1,-1};
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

				int fds[2] = {-1,-1};
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}
			
		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
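The three sandbox commands above differ only in the worker function and reaper id; the pipe creation, thread start, and map bookkeeping are identical. As a hedged sketch (a hypothetical helper, not in the original source, reusing this file's ChildErrorPipe, SandboxEnt, and sandbox_map globals and DaemonCore's ThreadStartFunc convention), the shared pattern could be factored out like this:

// Hypothetical helper -- a sketch, not part of the original source.
// Returns true if the worker thread started and was recorded.
static bool start_sandbox_thread( ThreadStartFunc worker, const char *command,
                                  int reaper_id, const Gahp_Args &args )
{
	int fds[2] = {-1,-1};
	if ( pipe( fds ) < 0 ) {
		EXCEPT( "Failed to create pipe!" );
	}
	ChildErrorPipe = fds[1];
	int tid = daemonCore->Create_Thread(worker, (void*)strdup(command),
	                                    NULL, reaper_id);
	close( fds[1] );
	if( !tid ) {
		close( fds[0] );
		return false;
	}
	SandboxEnt e;
	e.pid = tid;
	e.request_id = args.argv[1];
	e.sandbox_id = args.argv[2];
	e.error_pipe = fds[0];
	// transfer started, record the entry in the map
	sandbox_map.insert(std::pair<int, struct SandboxEnt>(tid, e));
	return true;
}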
Example #13
void
main_init( int, char ** const)
{
	dprintf(D_ALWAYS, "FT-GAHP IO thread\n");
	dprintf(D_SECURITY | D_FULLDEBUG, "FT-GAHP IO thread\n");

	int stdin_pipe = -1;
#if defined(WIN32)
	// if our parent is not DaemonCore, then our assumption that
	// the pipe we were passed in via stdin is overlapped-mode
	// is probably wrong. we therefore create a new pipe with the
	// read end overlapped and start a "forwarding thread"
	if (daemonCore->InfoCommandSinfulString(daemonCore->getppid()) == NULL) {

		dprintf(D_FULLDEBUG, "parent is not DaemonCore; creating a forwarding thread\n");

		int pipe_ends[2];
		if (daemonCore->Create_Pipe(pipe_ends, true) == FALSE) {
			EXCEPT("failed to create forwarding pipe");
		}
		forwarding_pipe = pipe_ends[1];
		HANDLE thread_handle = (HANDLE)_beginthreadex(NULL,                   // default security
		                                              0,                      // default stack size
		                                              pipe_forward_thread,    // start function
		                                              NULL,                   // arg: write end of pipe
		                                              0,                      // not suspended
													  NULL);                  // don't care about the ID
		if (thread_handle == NULL) {
			EXCEPT("failed to create forwarding thread");
		}
		CloseHandle(thread_handle);
		stdin_pipe = pipe_ends[0];
	}
#endif

	if (stdin_pipe == -1) {
		// create a DaemonCore pipe from our stdin
		stdin_pipe = daemonCore->Inherit_Pipe(fileno(stdin),
		                                      false,    // read pipe
		                                      true,     // registerable
		                                      false);   // blocking
	}

	stdin_buffer.setPipeEnd(stdin_pipe);

	(void)daemonCore->Register_Pipe (stdin_buffer.getPipeEnd(),
					"stdin pipe",
					(PipeHandler)&stdin_pipe_handler,
					"stdin_pipe_handler");

	// Setup dprintf to display pid
	DebugId = display_dprintf_header;

		// Print out the GAHP version to the screen
		// now we're ready to roll
	printf ("%s\n", version);
	fflush(stdout);
	dprintf(D_FULLDEBUG,"put stdout: %s\n",version);

	download_sandbox_reaper_id = daemonCore->Register_Reaper(
				"download_sandbox_reaper",
				&download_sandbox_reaper,
				"download_sandbox",
				NULL
				);
	dprintf(D_FULLDEBUG, "registered download_sandbox_reaper() at %i\n", download_sandbox_reaper_id);

	upload_sandbox_reaper_id = daemonCore->Register_Reaper(
				"upload_sandbox_reaper",
				&upload_sandbox_reaper,
				"upload_sandbox",
				NULL
				);
	dprintf(D_FULLDEBUG, "registered upload_sandbox_reaper() at %i\n", upload_sandbox_reaper_id);

	destroy_sandbox_reaper_id = daemonCore->Register_Reaper(
				"destroy_sandbox_reaper",
				&destroy_sandbox_reaper,
				"destroy_sandbox",
				NULL
				);
	dprintf(D_FULLDEBUG, "registered destroy_sandbox_reaper() at %i\n", destroy_sandbox_reaper_id);

	dprintf (D_FULLDEBUG, "FT-GAHP IO initialized\n");
}
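For context, each reaper registered above is called with the worker thread's id and exit status when it finishes. A minimal sketch of what download_sandbox_reaper might look like (the body is illustrative; only the usual DaemonCore reaper signature is assumed):

int download_sandbox_reaper( Service *, int pid, int exit_status )
{
	dprintf( D_ALWAYS, "download_sandbox thread %d exited with status %d\n",
			 pid, exit_status );
	// Illustrative only: the real reaper would look up 'pid' in
	// sandbox_map, drain the error pipe, enqueue the GAHP result
	// line, and erase the map entry.
	return 0;
}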
Example #14
static void doDBconfig() {

	char *tmp, *host = NULL, *port = NULL, *DBIpAddress=NULL, *DBName=NULL, *DBUser=NULL;
	int len;

	if (param_boolean("QUILL_ENABLED", false) == false) {
		EXCEPT("Quill++ is currently disabled. Please set QUILL_ENABLED to "
			   "TRUE if you want this functionality and read the manual "
			   "about this feature since it requires other attributes to be "
			   "set properly.");
	}

		//bail out if no SPOOL variable is defined since it's used to
		//figure out the location of the .pgpass file
	char *spool = param("SPOOL");
	if(!spool) {
		EXCEPT("No SPOOL variable found in config file\n");
	}
  
	tmp = param("QUILL_DB_TYPE");
	if (tmp) {
		if (strcasecmp(tmp, "PGSQL") == 0) {
			dt = T_PGSQL;
		}
		free(tmp);
	} else {
		dt = T_PGSQL; // assume PGSQL by default
	}

		/*
		  Here we try to read the <ipaddress:port> stored in condor_config.
		  If one is not specified, we default to the local address and
		  the standard postgres port of 5432.
		*/
	DBIpAddress = param("QUILL_DB_IP_ADDR");
	if(DBIpAddress) {
		len = strlen(DBIpAddress);
		host = (char *) malloc((len + 1) * sizeof(char));
		port = (char *) malloc((len + 1) * sizeof(char));

			//split the <ipaddress:port> into its two parts accordingly
		char *ptr_colon = strchr(DBIpAddress, ':');
		if(!ptr_colon) {
			EXCEPT("QUILL_DB_IP_ADDR must have the form <ipaddress:port>");
		}
		strncpy(host, DBIpAddress, 
				ptr_colon - DBIpAddress);
			// terminate the string properly
		host[ptr_colon - DBIpAddress] = '\0';
		strncpy(port, ptr_colon+1, len);
			// terminate the string properly
		port[strlen(ptr_colon+1)] = '\0';
	}

		/* Here we read the database name; if one is not specified we
		   use the default name - quill.
		   If more than one quill daemon is writing to the same
		   database, it is absolutely necessary that the database names
		   be unique or else there would be clashes.  Having unique
		   database names is the responsibility of the administrator.
		*/
	DBName = param("QUILL_DB_NAME");

	DBUser = param("QUILL_DB_USER");

		// get the password from the .pgpass file
	MyString writePasswordFile; 
	writePasswordFile.sprintf("%s/.pgpass", spool);

	MyString writePassword = getWritePassword(writePasswordFile.Value(), 
										   host?host:"", port?port:"", 
										   DBName?DBName:"", 
										   DBUser);
	MyString DBConn;

	DBConn.sprintf("host=%s port=%s user=%s password=%s dbname=%s", 
				   host?host:"", port?port:"", 
				   DBUser?DBUser:"", 
				   writePassword.Value(), 
				   DBName?DBName:"");
  	
	fprintf(stdout, "Using Database Type = Postgres\n");
	fprintf(stdout, "Using Database IpAddress = %s\n", 
			DBIpAddress?DBIpAddress:"");
	fprintf(stdout, "Using Database Name = %s\n", 
			DBName?DBName:"");
	fprintf(stdout, "Using Database User = %s\n", 
			DBUser?DBUser:"");

	if(spool) {
		free(spool);
		spool = NULL;
	}

	if(host) {
		free(host);
		host = NULL;
	}

	if(port) {
		free(port);
		port = NULL;
	}

	switch (dt) {
		case T_PGSQL:
			DBObj = new PGSQLDatabase(DBConn.Value());
			break;
		default:
			break;
	}

	QuillErrCode ret_st;

	ret_st = DBObj->connectDB();
	if (ret_st == QUILL_FAILURE) {
		fprintf(stderr, "doDBconfig: unable to connect to DB--- ERROR");
		exit(1);
	}

/*		
	tmp = param( "SCHEDD_NAME" );
	if( tmp ) {
		scheddname = build_valid_daemon_name( tmp );
		dprintf(D_FULLDEBUG, "scheddname %s built from param value %s\n", 
				scheddname, tmp);
		free(tmp);

	} else {
		scheddname = default_daemon_name();
		dprintf(D_FULLDEBUG, "scheddname built from default daemon name: %s\n", scheddname);
	}
*/	

	free(DBIpAddress);
	free(DBName);
	free(DBUser);
}
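The manual malloc/strncpy splitting of QUILL_DB_IP_ADDR above is easy to get wrong. As a hedged alternative, a sketch of the same split using std::string (a hypothetical helper, not in the original source) would be:

#include <string>

// Split "<host>:<port>"; if no colon is present, the whole string is
// treated as the host and the caller applies the default port (5432).
static void split_host_port( const std::string &addr,
                             std::string &host, std::string &port )
{
	std::string::size_type colon = addr.find(':');
	if ( colon == std::string::npos ) {
		host = addr;
		port.clear();
	} else {
		host = addr.substr(0, colon);
		port = addr.substr(colon + 1);
	}
}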
Example #15
void* DaemonCore::Stats::New(const char * category, const char * name, int as)
{
   MyString attr;
   attr.formatstr("DC%s_%s", category, name);
   cleanStringForUseAsAttr(attr);

   void * ret = NULL;
   switch (as & (AS_TYPE_MASK | IS_CLASS_MASK))
      {
      case AS_COUNT | IS_RECENT:
         {
         stats_entry_recent<int> * probe =
         Pool.NewProbe< stats_entry_recent<int> >(name,  attr.Value(), as);
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

     case  AS_COUNT | IS_CLS_EMA:
         {
         stats_entry_ema<int>* probe =
         Pool.NewProbe< stats_entry_ema<int> >(name, attr.Value(), as | stats_entry_ema<int>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  STATS_ENTRY_TYPE_DOUBLE | IS_CLS_EMA:
         {
         stats_entry_ema<double>* probe =
         Pool.NewProbe< stats_entry_ema<double> >(name, attr.Value(), as | stats_entry_ema<double>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  AS_COUNT | IS_CLS_SUM_EMA_RATE:
         {
         stats_entry_sum_ema_rate<int>* probe =
         Pool.NewProbe< stats_entry_sum_ema_rate<int> >(name, attr.Value(), as | stats_entry_sum_ema_rate<int>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  STATS_ENTRY_TYPE_DOUBLE | IS_CLS_SUM_EMA_RATE:
         {
         stats_entry_sum_ema_rate<double>* probe =
         Pool.NewProbe< stats_entry_sum_ema_rate<double> >(name, attr.Value(), as | stats_entry_sum_ema_rate<double>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

      case AS_ABSTIME | IS_RECENT:
      case AS_RELTIME | IS_RECENT:
         {
         stats_entry_recent<time_t> * probe =
         Pool.NewProbe< stats_entry_recent<time_t> >(name,  attr.Value(), as);
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

      case AS_COUNT | IS_RCT:
#ifdef USE_MIRON_PROBE_FOR_DC_RUNTIME_STATS
         {
         as &= ~(IS_CLASS_MASK);  // strip off IS_RTC class
         as |= IS_CLS_PROBE | IF_RT_SUM; // and set IS_CLS_PROBE & IF_RT_SUM classes
         stats_entry_probe<double> * probe =
         Pool.NewProbe< stats_entry_probe<double> >(name, attr.Value(), as);
         ret = probe;
         }
         break;
#else
          // fall through
#endif
      case AS_RELTIME | IS_RCT:
         {
         stats_recent_counter_timer * probe =
         Pool.NewProbe<stats_recent_counter_timer>(name, attr.Value(), as);
        #if 0 // def DEBUG
         attr += "Debug";
         Pool.AddPublish(attr.Value(), probe, NULL, 0, 
                       (FN_STATS_ENTRY_PUBLISH)&stats_recent_counter_timer::PublishDebug);
        #endif
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

      default:
         EXCEPT("unsupported probe type");
         break;
      }

   return ret;
}
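A hypothetical call site, to show how the flag sets select a probe class (the names dc_stats, "Command", and "HandleSigterm" are illustrative, not taken from the original):

// Create a recent-count probe; AS_COUNT | IS_RECENT selects the
// stats_entry_recent<int> branch above.
void *probe = daemonCore->dc_stats.New("Command", "HandleSigterm",
                                       AS_COUNT | IS_RECENT);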
Example #16
bool VanillaToGrid::vanillaToGrid(classad::ClassAd * ad, int target_universe, const char * gridresource, bool is_sandboxed)
{
	ASSERT(ad);

	/* TODO:
		- If the job fails to specify transfer_input_files but has some
		  (relying on a shared filesystem), we're doomed.  Probably,
		  if should_transfer_files is NO, we should just fail.
		  (We'll do the right thing if grid_type=condor, but not other
		  grid types.)

		- The job may be relying on Condor automatically transferring
		  output files.  If transfer_output_files is set, we're probably
		  good (again, assuming no shared filesystem).  If it's not set,
		  we can't tell the difference between no output files and
		  "everything".
		  (We'll do the right thing if grid_type=condor, but not other
		  grid types.)

		- I'm leaving WhenToTransferFiles alone.  It should work fine.

		- Should I leave FilesystemDomain in?  It may be useful, but at
		  the same time it will likely do the Wrong Thing on the remote
		  side.
	*/

	MyString remoteattr;
	remoteattr = "Remote_";
	remoteattr += ATTR_JOB_UNIVERSE;

	// Things we don't need or want
	ad->Delete(ATTR_CLUSTER_ID); // Definite no-no
	ad->Delete(ATTR_PROC_ID);


	ad->Delete(ATTR_BUFFER_BLOCK_SIZE);
	ad->Delete(ATTR_BUFFER_SIZE);
	ad->Delete("CondorPlatform"); // TODO: Find #define
	ad->Delete("CondorVersion");  // TODO: Find #define
	ad->Delete(ATTR_CORE_SIZE);
	ad->Delete(ATTR_GLOBAL_JOB_ID); // Store in different ATTR here?
	//ad->Delete(ATTR_OWNER); // How does schedd filter? 
	ad->Delete(ATTR_Q_DATE);
	ad->Delete(ATTR_JOB_REMOTE_WALL_CLOCK);
	ad->Delete(ATTR_SERVER_TIME);
	ad->Delete(ATTR_AUTO_CLUSTER_ID);
	ad->Delete(ATTR_AUTO_CLUSTER_ATTRS);
	ad->Delete(ATTR_TOTAL_SUBMIT_PROCS);
	ad->Delete( ATTR_STAGE_IN_FINISH );
	ad->Delete( ATTR_STAGE_IN_START );

	// We aren't going to forward updates to this attribute,
	// so strip it out.
	// We do evaluate it locally in the source job ad.
	ad->Delete(ATTR_TIMER_REMOVE_CHECK);

	ad->Delete("SUBMIT_" ATTR_JOB_IWD); // the presence of this would prevent schedd from rewriting spooled iwd

	// Stuff to reset
	ad->InsertAttr(ATTR_JOB_STATUS, 1); // Idle
	ad->InsertAttr(ATTR_JOB_REMOTE_USER_CPU, 0.0);
	ad->InsertAttr(ATTR_JOB_REMOTE_SYS_CPU, 0.0);
	ad->InsertAttr(ATTR_JOB_EXIT_STATUS, 0);
	ad->InsertAttr(ATTR_COMPLETION_DATE, 0);
	ad->InsertAttr(ATTR_JOB_LOCAL_SYS_CPU, 0.0);
	ad->InsertAttr(ATTR_JOB_LOCAL_USER_CPU, 0.0);
	ad->InsertAttr(ATTR_NUM_CKPTS, 0);
	ad->InsertAttr(ATTR_NUM_RESTARTS, 0);
	ad->InsertAttr(ATTR_NUM_SYSTEM_HOLDS, 0);
	ad->InsertAttr(ATTR_JOB_COMMITTED_TIME, 0);
	ad->InsertAttr(ATTR_COMMITTED_SLOT_TIME, 0);
	ad->InsertAttr(ATTR_CUMULATIVE_SLOT_TIME, 0);
	ad->InsertAttr(ATTR_TOTAL_SUSPENSIONS, 0);
	ad->InsertAttr(ATTR_LAST_SUSPENSION_TIME, 0);
	ad->InsertAttr(ATTR_CUMULATIVE_SUSPENSION_TIME, 0);
	ad->InsertAttr(ATTR_COMMITTED_SUSPENSION_TIME, 0);
	ad->InsertAttr(ATTR_ON_EXIT_BY_SIGNAL, false);


	//ad->Delete(ATTR_MY_TYPE); // Should be implied
	//ad->Delete(ATTR_TARGET_TYPE); // Should be implied.

	// Remap the universe
	classad::ExprTree * tmp;
	tmp = ad->Lookup(ATTR_JOB_UNIVERSE);
	if( ! tmp ) {
		EXCEPT("VanillaToGrid: job ad lacks universe");
	}
	classad::ExprTree * olduniv = tmp->Copy();
	if( ! olduniv) {
		EXCEPT("Unable to copy old universe");
	}

	ad->InsertAttr(ATTR_JOB_UNIVERSE, target_universe);
	ad->Insert(remoteattr.Value(), olduniv, false);
		// olduniv is now controlled by ClassAd

	if( target_universe == CONDOR_UNIVERSE_GRID ) {
		ad->Delete(ATTR_CURRENT_HOSTS);

		// Set the grid resource
		if( gridresource ) {
			ad->InsertAttr(ATTR_GRID_RESOURCE, gridresource);
		}

		// Grid universe, unlike vanilla universe expects full output
		// paths for Out/Err.  In vanilla, these are basenames that
		// point into TransferOutputRemaps.  If the job is sandboxed,
		// then our remaps will apply when we fetch the output from
		// the completed job.  That's fine, so we leave the remaps
		// in place in that case.  Otherwise, we undo the remaps and
		// let the grid job write directly to the correct output
		// paths.

		std::string remaps;
		ad->EvaluateAttrString(ATTR_TRANSFER_OUTPUT_REMAPS,remaps);
		if( !is_sandboxed && remaps.size() ) {
			MyString remap_filename;
			std::string filename,filenames;

				// Don't need the remaps in the grid copy of the ad.
			ad->Delete(ATTR_TRANSFER_OUTPUT_REMAPS);

			if( ad->EvaluateAttrString(ATTR_JOB_OUTPUT,filename) ) {
				if( filename_remap_find(remaps.c_str(),filename.c_str(),remap_filename) ) {
					ad->InsertAttr(ATTR_JOB_OUTPUT,remap_filename.Value());
				}
			}

			if( ad->EvaluateAttrString(ATTR_JOB_ERROR,filename) ) {
				if( filename_remap_find(remaps.c_str(),filename.c_str(),remap_filename) ) {
					ad->InsertAttr(ATTR_JOB_ERROR,remap_filename.Value());
				}
			}

				// TransferOutputFiles appears to be different.  If it
				// behaved similarly to Out/Err, then we would want to
				// do the following:
#if 0
			if( ad->EvaluateAttrString(ATTR_TRANSFER_OUTPUT_FILES,filename) ) {
				StringList output_files(filename.c_str(),",");
				StringList new_list;
				char const *fname;

				output_files.rewind();
				while( (fname=output_files.next()) ) {
					if( filename_remap_find(remaps.c_str(),fname,remap_filename) )
						{
							new_list.append(remap_filename.Value());
						}
					else {
						new_list.append(fname);
					}
				}

				char *new_list_str = new_list.print_to_string();
				ASSERT( new_list_str );

				ad->InsertAttr(ATTR_TRANSFER_OUTPUT_FILES,new_list_str);

				free( new_list_str );
			}
#endif

		}
	}

	return true;
}
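The undo-the-remaps step relies on filename_remap_find() to look a name up in the TransferOutputRemaps list. As an illustration only, here is a simplified stand-in (not the real helper) that assumes the "src = dst; src = dst" remap syntax:

#include <sstream>
#include <string>

static bool simple_remap_find( const std::string &remaps,
                               const std::string &name, std::string &out )
{
	auto trim = []( const std::string &s ) -> std::string {
		std::string::size_type b = s.find_first_not_of(" \t");
		std::string::size_type e = s.find_last_not_of(" \t");
		return b == std::string::npos ? std::string() : s.substr(b, e - b + 1);
	};
	std::istringstream ss(remaps);
	std::string pair;
	while ( std::getline(ss, pair, ';') ) {
		std::string::size_type eq = pair.find('=');
		if ( eq == std::string::npos ) continue;
		if ( trim(pair.substr(0, eq)) == name ) {
			out = trim(pair.substr(eq + 1));
			return true;
		}
	}
	return false;
}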
Example #17
File: statsd.cpp Project: blueskyll/condor
bool
Metric::evaluateDaemonAd(classad::ClassAd &metric_ad,classad::ClassAd const &daemon_ad,int max_verbosity,StatsD *statsd,ExtArray<MyString> *regex_groups,char const *regex_attr)
{
    if( regex_attr ) {
        name = regex_attr;
    }
    if( !evaluateOptionalString(ATTR_NAME,name,metric_ad,daemon_ad,regex_groups) ) return false;

    metric_ad.EvaluateAttrInt(ATTR_VERBOSITY,verbosity);
    if( verbosity > max_verbosity ) {
        // avoid doing more work; this metric requires higher verbosity
        return false;
    }

    std::string target_type_str;
    if( !evaluateOptionalString(ATTR_TARGET_TYPE,target_type_str,metric_ad,daemon_ad,regex_groups) ) return false;
    target_type.initializeFromString(target_type_str.c_str());
    if( target_type.contains_anycase("machine_slot1") ) {
        restrict_slot1 = true;
    }

    std::string my_type;
    daemon_ad.EvaluateAttrString(ATTR_MY_TYPE,my_type);
    if( !target_type.isEmpty() && !target_type.contains_anycase("any") ) {
        if( restrict_slot1 && !strcasecmp(my_type.c_str(),"machine") ) {
            int slotid = 1;
            daemon_ad.EvaluateAttrInt(ATTR_SLOT_ID,slotid);
            if( slotid != 1 ) {
                return false;
            }
            bool dynamic_slot = false;
            daemon_ad.EvaluateAttrBool(ATTR_SLOT_DYNAMIC,dynamic_slot);
            if( dynamic_slot ) {
                return false;
            }
        }
        else if( !target_type.contains_anycase(my_type.c_str()) ) {
            // avoid doing more work; this is not the right type of daemon ad
            return false;
        }
    }

    classad::Value requirements_val;
    requirements_val.SetBooleanValue(true);
    if( !evaluate(ATTR_REQUIREMENTS,requirements_val,metric_ad,daemon_ad,BOOLEAN,regex_groups,regex_attr) ) {
        return false;
    }
    bool requirements = true;
    if( !requirements_val.IsBooleanValue(requirements) || requirements!=true ) {
        return false;
    }

    if( !regex_attr ) {
        std::string regex;
        if( !evaluateOptionalString(ATTR_REGEX,regex,metric_ad,daemon_ad,NULL) ) return false;
        if( !regex.empty() ) {
            Regex re;
            const char *errptr=NULL;
            int erroffset=0;
            if( !re.compile(regex.c_str(),&errptr,&erroffset,PCRE_ANCHORED) ) {
                EXCEPT("Invalid regex %s",regex.c_str());
            }
            for( classad::ClassAd::const_iterator itr = daemon_ad.begin();
                    itr != daemon_ad.end();
                    itr++ )
            {
                ExtArray<MyString> the_regex_groups;
                if( re.match(itr->first.c_str(),&the_regex_groups) ) {
                    // make a new Metric for this attribute that matched the regex
                    counted_ptr<Metric> metric(statsd->newMetric());
                    metric->evaluateDaemonAd(metric_ad,daemon_ad,max_verbosity,statsd,&the_regex_groups,itr->first.c_str());
                }
            }
            return false;
        }
    }

    std::string aggregate_str;
    if( !evaluateOptionalString(ATTR_AGGREGATE,aggregate_str,metric_ad,daemon_ad,regex_groups) ) return false;
    aggregate = NO_AGGREGATE;
    if( strcasecmp(aggregate_str.c_str(),"sum")==0 ) {
        aggregate = SUM;
    }
    else if( strcasecmp(aggregate_str.c_str(),"avg")==0 ) {
        aggregate = AVG;
    }
    else if( strcasecmp(aggregate_str.c_str(),"min")==0 ) {
        aggregate = MIN;
    }
    else if( strcasecmp(aggregate_str.c_str(),"max")==0 ) {
        aggregate = MAX;
    }
    else if( !aggregate_str.empty() ) {
        EXCEPT("Invalid aggregate function %s",aggregate_str.c_str());
    }

    // set default stats grouping
    if( isAggregateMetric() ) {
        group = "HTCondor Pool";
    }
    else if( !strcasecmp(my_type.c_str(),"scheduler") ) {
        group = "HTCondor Schedd";
    }
    else if( !strcasecmp(my_type.c_str(),"machine") ) {
        group = "HTCondor Startd";
        if( !statsd->publishPerExecuteNodeMetrics() ) {
            return false;
        }
    }
    else if( !strcasecmp(my_type.c_str(),"daemonmaster") ) {
        group = "HTCondor Master";

        if( !statsd->publishPerExecuteNodeMetrics() ) {
            std::string machine_name;
            daemon_ad.EvaluateAttrString(ATTR_MACHINE,machine_name);
            if( statsd->isExecuteOnlyNode(machine_name) ) {
                return false;
            }
        }
    }
    else {
        formatstr(group,"HTCondor %s",my_type.c_str());
    }

    if( isAggregateMetric() ) {
        // This is like GROUP BY in SQL.
        // By default, group by the name of the metric.
        aggregate_group = name;
        if( !evaluateOptionalString(ATTR_AGGREGATE_GROUP,aggregate_group,metric_ad,daemon_ad,regex_groups) ) return false;
    }

    if( !evaluateOptionalString(ATTR_GROUP,group,metric_ad,daemon_ad,regex_groups) ) return false;

    if( !evaluateOptionalString(ATTR_TITLE,title,metric_ad,daemon_ad,regex_groups) ) return false;
    if( !evaluateOptionalString(ATTR_DESC,desc,metric_ad,daemon_ad,regex_groups) ) return false;
    if( !evaluateOptionalString(ATTR_UNITS,units,metric_ad,daemon_ad,regex_groups) ) return false;
    if( !evaluateOptionalString(ATTR_CLUSTER,cluster,metric_ad,daemon_ad,regex_groups) ) return false;

    metric_ad.EvaluateAttrBool(ATTR_DERIVATIVE,derivative);
    metric_ad.EvaluateAttrNumber(ATTR_SCALE,scale);

    std::string type_str;
    if( !evaluateOptionalString(ATTR_TYPE,type_str,metric_ad,daemon_ad,regex_groups) ) return false;
    if( type_str.empty() ) {
        type = AUTO;
    }
    else if( strcasecmp(type_str.c_str(),"string")==0 ) {
        type = STRING;
    }
    else if( strcasecmp(type_str.c_str(),"int8")==0 ) {
        type = INT8;
    }
    else if( strcasecmp(type_str.c_str(),"uint8")==0 ) {
        type = UINT8;
    }
    else if( strcasecmp(type_str.c_str(),"int16")==0 ) {
        type = INT16;
    }
    else if( strcasecmp(type_str.c_str(),"uint16")==0 ) {
        type = UINT16;
    }
    else if( strcasecmp(type_str.c_str(),"int32")==0 ) {
        type = INT32;
    }
    else if( strcasecmp(type_str.c_str(),"uint32")==0 ) {
        type = UINT32;
    }
    else if( strcasecmp(type_str.c_str(),"float")==0 ) {
        type = FLOAT;
    }
    else if( strcasecmp(type_str.c_str(),"double")==0 ) {
        type = DOUBLE;
    }
    else if( strcasecmp(type_str.c_str(),"boolean")==0 ) {
        type = BOOLEAN;
    }
    else {
        EXCEPT("Invalid metric attribute type=%s for %s",type_str.c_str(),whichMetric().c_str());
        return false;
    }

    value.SetUndefinedValue();
    if( !evaluate(ATTR_VALUE,value,metric_ad,daemon_ad,type,regex_groups,regex_attr) ) {
        return false;
    }
    if( value.IsUndefinedValue() ) {
        if( regex_attr ) {
            daemon_ad.EvaluateAttr(regex_attr,value);
        }
        else {
            daemon_ad.EvaluateAttr(name,value);
        }
    }
    if( value.IsUndefinedValue() ) {
        return false;
    }
    if ( type == AUTO ) {
        if (value.IsBooleanValue() ) {
            type = BOOLEAN;
        } else if ( value.IsIntegerValue() )  {
            type = INT32;
        } else if ( value.IsNumber() ) {
            type = FLOAT;
        } else if ( value.IsStringValue() ) {
            type = STRING;
        }
    }

    if( isAggregateMetric() ) {
        machine = statsd->getDefaultAggregateHost();
    }
    else {
        if( (!strcasecmp(my_type.c_str(),"machine") && restrict_slot1) || !strcasecmp(my_type.c_str(),"collector") ) {
            // for STARTD_SLOT1 metrics, advertise by default as host.name, not slot1.host.name
            // ditto for the collector, which typically has a daemon name != machine name, even though there is only one collector
            daemon_ad.EvaluateAttrString(ATTR_MACHINE,machine);
        }
        else if( !strcasecmp(my_type.c_str(),"submitter") ) {
            daemon_ad.EvaluateAttrString(ATTR_SCHEDD_NAME,machine);
        }
        else {
            // use the daemon name for the metric machine name
            daemon_ad.EvaluateAttrString(ATTR_NAME,machine);
        }
    }
    if( !evaluateOptionalString(ATTR_MACHINE,machine,metric_ad,daemon_ad,regex_groups) ) return false;

    statsd->getDaemonIP(machine,ip);
    if( !evaluateOptionalString(ATTR_IP,ip,metric_ad,daemon_ad,regex_groups) ) return false;

    if( isAggregateMetric() ) {
        statsd->addToAggregateValue(*this);
    }
    else {
        statsd->publishMetric(*this);
    }
    return true;
}
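For orientation, a metric ad of the kind this function evaluates could be written in new ClassAd syntax and parsed as below; the attribute names are the ones read above, but the values are made up for illustration:

#include "classad/classad_distribution.h"

// Hypothetical metric ad -- illustrative values only.
classad::ClassAdParser parser;
classad::ClassAd *metric_ad = parser.ParseClassAd(
	"[ Name = \"JobsRunning\"; TargetType = \"Scheduler\"; "
	"Verbosity = 0; Aggregate = \"SUM\"; Value = TotalRunningJobs ]", true);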
Example #18
File: rooster.cpp Project: emaste/htcondor
void Rooster::poll()
{
	dprintf(D_FULLDEBUG,"C**k-a-doodle-doo! (Time to look for machines to wake up.)\n");

	ClassAdList startdAds;
	CondorQuery unhibernateQuery(STARTD_AD);
	ExprTree *requirements = NULL;

	if( ParseClassAdRvalExpr( m_unhibernate_constraint.Value(), requirements )!=0 || requirements==NULL )
	{
		EXCEPT("Invalid expression for ROOSTER_UNHIBERNATE: %s\n",
			   m_unhibernate_constraint.Value());
	}

	unhibernateQuery.addANDConstraint(m_unhibernate_constraint.Value());

	CollectorList* collects = daemonCore->getCollectorList();
	ASSERT( collects );

	QueryResult result;
	result = collects->query(unhibernateQuery,startdAds);
	if( result != Q_OK ) {
		dprintf(D_ALWAYS,
				"Couldn't fetch startd ads using constraint "
				"ROOSTER_UNHIBERNATE=%s: %s\n",
				m_unhibernate_constraint.Value(), getStrQueryResult(result));
		return;
	}

	dprintf(D_FULLDEBUG,"Got %d startd ads matching ROOSTER_UNHIBERNATE=%s\n",
			startdAds.MyLength(), m_unhibernate_constraint.Value());

	startdAds.Sort(StartdSortFunc,&m_rank_ad);

	startdAds.Open();
	int num_woken = 0;
	ClassAd *startd_ad;
	HashTable<MyString,bool> machines_done(MyStringHash);
	while( (startd_ad=startdAds.Next()) ) {
		MyString machine;
		MyString name;
		startd_ad->LookupString(ATTR_MACHINE,machine);
		startd_ad->LookupString(ATTR_NAME,name);

		if( machines_done.exists(machine)==0 ) {
			dprintf(D_FULLDEBUG,
					"Skipping %s: already attempted to wake up %s in this cycle.\n",
					name.Value(),machine.Value());
			continue;
		}

			// in case the unhibernate expression is time-sensitive,
			// re-evaluate it now to make sure it still passes
		if( !EvalBool(startd_ad,requirements) ) {
			dprintf(D_ALWAYS,
					"Skipping %s: ROOSTER_UNHIBERNATE is no longer true.\n",
					name.Value());
			continue;
		}

		if( wakeUp(startd_ad) ) {
			machines_done.insert(machine,true);

			if( ++num_woken >= m_max_unhibernate && m_max_unhibernate > 0 ) {
				dprintf(D_ALWAYS,
						"Reached ROOSTER_MAX_UNHIBERNATE=%d in this cycle.\n",
						m_max_unhibernate);
				break;
			}
		}
	}
	startdAds.Close();

	delete requirements;
	requirements = NULL;

	if( startdAds.MyLength() ) {
		dprintf(D_FULLDEBUG,"Done sending wakeup calls.\n");
	}
}
Example #19
File: statsd.cpp Project: blueskyll/condor
void
StatsD::ParseMetrics( std::string const &stats_metrics_string, char const *param_name, std::list< classad::ClassAd * > &stats_metrics ) {

    // Parse a list of metrics.  The expected syntax is a list of
    // ClassAds, optionally delimited by commas and/or whitespace.

    int offset = 0;
    while(1) {
        if(offset >= (int)stats_metrics_string.size()) break;

        int this_offset = offset; //save offset before eating an ad.

        std::string error_msg;

        classad::ClassAdParser parser;
        classad::ClassAd *ad = new classad::ClassAd;
        bool failed = false;
        if(!parser.ParseClassAd(stats_metrics_string,*ad,offset)) {
            int final_offset = this_offset;
            std::string final_stats_metrics_string = stats_metrics_string;
            final_stats_metrics_string += "\n[]"; // add an empty ClassAd

            if(parser.ParseClassAd(final_stats_metrics_string,*ad,final_offset)) {
                // There must have been some trailing whitespace or
                // comments after the last ClassAd, so the only reason
                // ParseClassAd() failed was because there was no ad.
                // Therefore, we are done.
                delete ad;
                break;
            }
            failed = true;
        }

        if( failed ) {
            EXCEPT("CONFIGURATION ERROR: error in metrics defined in %s: %s, for entry starting here: %.80s\n",
                   param_name,error_msg.c_str(),stats_metrics_string.c_str() + this_offset);
        }

        classad::ClassAd *ad2 = new ClassAd(m_default_metric_ad);
        ad2->Update(*ad);
        delete ad;
        ad = ad2;

        int verbosity = 0;
        ad->EvaluateAttrInt(ATTR_VERBOSITY,verbosity);
        if( verbosity > m_verbosity ) {
            delete ad;
            continue;
        }

        // for efficient queries to the collector, keep track of
        // which type of ads we need
        std::string target_type;
        ad->EvaluateAttrString(ATTR_TARGET_TYPE,target_type);
        if( target_type.empty() ) {
            classad::ClassAdUnParser unparser;
            std::string ad_str;
            unparser.Unparse(ad_str,ad);
            EXCEPT("CONFIGURATION ERROR: no target type specified for metric defined in %s: %s\n",
                   param_name,
                   ad_str.c_str());
        }
        StringList target_types(target_type.c_str());
        m_target_types.create_union(target_types,true);

        stats_metrics.push_back(ad);
    }
}
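The essential trick above is that ParseClassAd() advances its offset argument past each ad it consumes, so a single string can hold many ads; appending an empty "[]" distinguishes trailing whitespace from a genuine parse error. A distilled sketch of just the loop:

#include "classad/classad_distribution.h"
#include <string>

void parse_all( const std::string &input )
{
	classad::ClassAdParser parser;
	int offset = 0;
	while ( offset < (int)input.size() ) {
		classad::ClassAd ad;
		if ( !parser.ParseClassAd(input, ad, offset) ) {
			break;	// no further ad (or a syntax error) at this offset
		}
		// ... use 'ad' ...
	}
}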
Example #20
File: rooster.cpp Project: emaste/htcondor
void Rooster::config()
{
	int old_polling_interval = m_polling_interval;
	m_polling_interval = param_integer("ROOSTER_INTERVAL",300);
	if( m_polling_interval < 0 ) {
		dprintf(D_ALWAYS,
				"ROOSTER_INTERVAL is less than 0, so no unhibernate checks "
				"will be made.\n");
		if( m_polling_timer != -1 ) {
			daemonCore->Cancel_Timer(m_polling_timer);
			m_polling_timer = -1;
		}
	}
	else if( m_polling_timer >= 0 ) {
		if( old_polling_interval != m_polling_interval ) {
			daemonCore->Reset_Timer(
				m_polling_timer,
				m_polling_interval,
				m_polling_interval);
		}
	}
	else {
		m_polling_timer = daemonCore->Register_Timer(
			m_polling_interval,
			m_polling_interval,
			(TimerHandlercpp)&Rooster::poll,
			"Rooster::poll",
			this );
	}
	if( old_polling_interval != m_polling_interval && m_polling_interval > 0 )
	{
		dprintf(D_ALWAYS,
				"Will perform unhibernate checks every ROOSTER_INTERVAL=%d "
				"seconds.\n", m_polling_interval);
	}

	ASSERT( param(m_unhibernate_constraint,"ROOSTER_UNHIBERNATE") );


	ASSERT( param(m_wakeup_cmd,"ROOSTER_WAKEUP_CMD") );

	m_wakeup_args.Clear();
	MyString error_msg;
	if( !m_wakeup_args.AppendArgsV2Quoted(m_wakeup_cmd.Value(),&error_msg) ) {
		EXCEPT("Invalid wakeup command %s: %s\n",
			   m_wakeup_cmd.Value(), error_msg.Value());
	}

	MyString rank;
	param(rank,"ROOSTER_UNHIBERNATE_RANK");
	if( rank.IsEmpty() ) {
		m_rank_ad.Delete(ATTR_RANK);
	}
	else {
		if( !m_rank_ad.AssignExpr(ATTR_RANK,rank.Value()) ) {
			EXCEPT("Invalid expression for ROOSTER_UNHIBERNATE_RANK: %s\n",
				   rank.Value());
		}
	}

	m_max_unhibernate = param_integer("ROOSTER_MAX_UNHIBERNATE",0,0);
}
Example #21
void NordugridJob::doEvaluateState()
{
	int old_gm_state;
	bool reevaluate_state = true;
	time_t now = time(NULL);

	bool attr_exists;
	bool attr_dirty;
	int rc;

	daemonCore->Reset_Timer( evaluateStateTid, TIMER_NEVER );

	dprintf(D_ALWAYS,
			"(%d.%d) doEvaluateState called: gmState %s, condorState %d\n",
			procID.cluster,procID.proc,GMStateNames[gmState],condorState);

	if ( gahp ) {
		if ( !resourceStateKnown || resourcePingPending || resourceDown ) {
			gahp->setMode( GahpClient::results_only );
		} else {
			gahp->setMode( GahpClient::normal );
		}
	}

	do {
		reevaluate_state = false;
		old_gm_state = gmState;

		switch ( gmState ) {
		case GM_INIT: {
			// This is the state all jobs start in when the GlobusJob object
			// is first created. Here, we do things that we didn't want to
			// do in the constructor because they could block (the
			// constructor is called while we're connected to the schedd).
			if ( gahp->Startup() == false ) {
				dprintf( D_ALWAYS, "(%d.%d) Error starting GAHP\n",
						 procID.cluster, procID.proc );

				jobAd->Assign( ATTR_HOLD_REASON, "Failed to start GAHP" );
				gmState = GM_HOLD;
				break;
			}
			if ( gahp->Initialize( jobProxy ) == false ) {
				dprintf( D_ALWAYS, "(%d.%d) Error initializing GAHP\n",
						 procID.cluster, procID.proc );

				jobAd->Assign( ATTR_HOLD_REASON,
							   "Failed to initialize GAHP" );
				gmState = GM_HOLD;
				break;
			}

			gahp->setDelegProxy( jobProxy );

			gmState = GM_START;
			} break;
		case GM_START: {
			errorString = "";
			if ( remoteJobId == NULL ) {
				gmState = GM_CLEAR_REQUEST;
			} else {
				submitLogged = true;
				if ( condorState == RUNNING ||
					 condorState == COMPLETED ) {
					executeLogged = true;
				}

				if ( remoteJobState == "" ||
					 remoteJobState == REMOTE_STATE_ACCEPTING ||
					 remoteJobState == REMOTE_STATE_ACCEPTED ||
					 remoteJobState == REMOTE_STATE_PREPARING ) {
					gmState = GM_RECOVER_QUERY;
				} else {
					gmState = GM_SUBMITTED;
				}
			}
			} break;
		case GM_RECOVER_QUERY: {
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				if ( m_lastRemoteStatusUpdate > enteredCurrentGmState ) {
					if ( remoteJobState == REMOTE_STATE_ACCEPTING ||
						 remoteJobState == REMOTE_STATE_ACCEPTED ||
						 remoteJobState == REMOTE_STATE_PREPARING ) {
						gmState = GM_STAGE_IN;
					} else {
						gmState = GM_SUBMITTED;
					}
				} else if ( m_currentStatusUnknown ) {
					gmState = GM_CANCEL;
				}
			}
			} break;
		case GM_UNSUBMITTED: {
			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else if ( condorState == HELD ) {
				gmState = GM_DELETE;
				break;
			} else {
				gmState = GM_SUBMIT;
			}
			} break;
		case GM_SUBMIT: {
			if ( condorState == REMOVED || condorState == HELD ) {
				myResource->CancelSubmit( this );
				gmState = GM_UNSUBMITTED;
				break;
			}
			if ( numSubmitAttempts >= MAX_SUBMIT_ATTEMPTS ) {
//				jobAd->Assign( ATTR_HOLD_REASON,
//							   "Attempts to submit failed" );
				gmState = GM_HOLD;
				break;
			}
			// After a submit, wait at least submitInterval before trying
			// another one.
			if ( now >= lastSubmitAttempt + submitInterval ) {

				char *job_id = NULL;

				// Once RequestSubmit() is called at least once, you must
				// CancelRequest() once you're done with the request call
				if ( myResource->RequestSubmit( this ) == false ) {
					break;
				}

				if ( RSL == NULL ) {
					RSL = buildSubmitRSL();
				}
				if ( RSL == NULL ) {
					gmState = GM_HOLD;
					break;
				}
				rc = gahp->nordugrid_submit( 
										resourceManagerString,
										RSL->c_str(),
										job_id );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				}

				lastSubmitAttempt = time(NULL);
				numSubmitAttempts++;

				if ( rc == 0 ) {
					ASSERT( job_id != NULL );
					SetRemoteJobId( job_id );
					free( job_id );
					WriteGridSubmitEventToUserLog( jobAd );
					gmState = GM_SUBMIT_SAVE;
				} else {
					errorString = gahp->getErrorString();
					dprintf(D_ALWAYS,"(%d.%d) job submit failed: %s\n",
							procID.cluster, procID.proc,
							errorString.c_str() );
					myResource->CancelSubmit( this );
					gmState = GM_UNSUBMITTED;
				}

			} else {
				unsigned int delay = 0;
				if ( (lastSubmitAttempt + submitInterval) > now ) {
					delay = (lastSubmitAttempt + submitInterval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
			}
			} break;
		case GM_SUBMIT_SAVE: {
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				jobAd->GetDirtyFlag( ATTR_GRID_JOB_ID, &attr_exists, &attr_dirty );
				if ( attr_exists && attr_dirty ) {
					requestScheddUpdate( this, true );
					break;
				}
				gmState = GM_STAGE_IN;
			}
			} break;
		case GM_STAGE_IN: {
			if ( stageList == NULL ) {
				const char *file;
				stageList = buildStageInList();
				stageList->rewind();
				while ( (file = stageList->next()) ) {
					if ( IsUrl( file ) ) {
						stageList->deleteCurrent();
					}
				}
			}
			rc = gahp->nordugrid_stage_in( resourceManagerString, remoteJobId,
										   *stageList );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) file stage in failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				gmState = GM_SUBMITTED;
			}
			} break;
		case GM_SUBMITTED: {
			if ( remoteJobState == REMOTE_STATE_FINISHED ||
				 remoteJobState == REMOTE_STATE_FAILED ||
				 remoteJobState == REMOTE_STATE_KILLED ||
				 remoteJobState == REMOTE_STATE_DELETED ) {
					gmState = GM_EXIT_INFO;
			} else if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				if ( lastProbeTime < enteredCurrentGmState ) {
					lastProbeTime = enteredCurrentGmState;
				}
				if ( probeNow ) {
					lastProbeTime = 0;
					probeNow = false;
				}
/*
				int probe_interval = myResource->GetJobPollInterval();
				if ( now >= lastProbeTime + probe_interval ) {
					gmState = GM_PROBE_JOB;
					break;
				}
				unsigned int delay = 0;
				if ( (lastProbeTime + probe_interval) > now ) {
					delay = (lastProbeTime + probe_interval) - now;
				}				
				daemonCore->Reset_Timer( evaluateStateTid, delay );
*/
			}
			} break;
		case GM_PROBE_JOB: {
			if ( condorState == REMOVED || condorState == HELD ) {
				gmState = GM_CANCEL;
			} else {
				char *new_status = NULL;
				rc = gahp->nordugrid_status( resourceManagerString,
											 remoteJobId, new_status );
				if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
					 rc == GAHPCLIENT_COMMAND_PENDING ) {
					break;
				} else if ( rc != 0 ) {
					// What to do about failure?
					errorString = gahp->getErrorString();
					dprintf( D_ALWAYS, "(%d.%d) job probe failed: %s\n",
							 procID.cluster, procID.proc,
							 errorString.c_str() );
				} else {
					if ( new_status ) {
						remoteJobState = new_status;
					} else {
						remoteJobState = "";
					}
					SetRemoteJobStatus( new_status );
				}
				if ( new_status ) {
					free( new_status );
				}
				lastProbeTime = now;
				gmState = GM_SUBMITTED;
			}
			} break;
		case GM_EXIT_INFO: {
			std::string filter;
			StringList reply;
			formatstr( filter, "nordugrid-job-globalid=gsiftp://%s:2811/jobs/%s",
							resourceManagerString, remoteJobId );

			rc = gahp->nordugrid_ldap_query( resourceManagerString, "mds-vo-name=local,o=grid", filter.c_str(), "nordugrid-job-usedcputime,nordugrid-job-usedwalltime,nordugrid-job-exitcode", reply );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) exit info gathering failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				int exit_code = -1;
				int wallclock = -1;
				int cpu = -1;
				const char *entry;
				reply.rewind();
				while ( (entry = reply.next()) ) {
					if ( !strncmp( entry, "nordugrid-job-usedcputime: ", 27 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						cpu = atoi( entry );
					} else if ( !strncmp( entry, "nordugrid-job-usedwalltime: ", 28 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						wallclock = atoi( entry );
					} else if ( !strncmp( entry, "nordugrid-job-exitcode: ", 24 ) ) {
						entry = strchr( entry, ' ' ) + 1;
						exit_code = atoi( entry );
					}
				}
				if ( exit_code < 0 || wallclock < 0 || cpu < 0 ) {
					dprintf( D_ALWAYS, "(%d.%d) exit info missing\n",
							 procID.cluster, procID.proc );
					gmState = GM_CANCEL;
					break;
				}
				if ( exit_code > 128 ) {
					jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, true );
					jobAd->Assign( ATTR_ON_EXIT_SIGNAL, exit_code - 128 );
				} else {
					jobAd->Assign( ATTR_ON_EXIT_BY_SIGNAL, false );
					jobAd->Assign( ATTR_ON_EXIT_CODE, exit_code );
				}
				jobAd->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, wallclock * 60 );
				jobAd->Assign( ATTR_JOB_REMOTE_USER_CPU, cpu * 60 );
				gmState = GM_STAGE_OUT;
			}
			} break;
		case GM_STAGE_OUT: {
			if ( stageList == NULL ) {
				stageList = buildStageOutList();
			}
			if ( stageLocalList == NULL ) {
				const char *file;
				stageLocalList = buildStageOutLocalList( stageList );
				stageList->rewind();
				stageLocalList->rewind();
				while ( (file = stageLocalList->next()) ) {
					ASSERT( stageList->next() );
					if ( IsUrl( file ) ) {
						stageList->deleteCurrent();
						stageLocalList->deleteCurrent();
					}
				}
			}
			rc = gahp->nordugrid_stage_out2( resourceManagerString,
											 remoteJobId,
											 *stageList, *stageLocalList );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) file stage out failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_CANCEL;
			} else {
				gmState = GM_DONE_SAVE;
			}
			} break;
		case GM_DONE_SAVE: {
			if ( condorState != HELD && condorState != REMOVED ) {
				JobTerminated();
				if ( condorState == COMPLETED ) {
					jobAd->GetDirtyFlag( ATTR_JOB_STATUS, &attr_exists, &attr_dirty );
					if ( attr_exists && attr_dirty ) {
						requestScheddUpdate( this, true );
						break;
					}
				}
			}
			gmState = GM_DONE_COMMIT;
			} break;
		case GM_DONE_COMMIT: {
			rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc != 0 ) {
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) job cleanup failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_HOLD;
				break;
			}
			myResource->CancelSubmit( this );
			if ( condorState == COMPLETED || condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				// Clear the contact string here because it may not get
				// cleared in GM_CLEAR_REQUEST (it might go to GM_HOLD first).
				if ( remoteJobId != NULL ) {
					SetRemoteJobId( NULL );
				}
				gmState = GM_CLEAR_REQUEST;
			}
			} break;
		case GM_CANCEL: {
			rc = gahp->nordugrid_cancel( resourceManagerString, remoteJobId );
			if ( rc == GAHPCLIENT_COMMAND_NOT_SUBMITTED ||
				 rc == GAHPCLIENT_COMMAND_PENDING ) {
				break;
			} else if ( rc == 0 ) {
				gmState = GM_FAILED;
			} else {
				// What to do about a failed cancel?
				errorString = gahp->getErrorString();
				dprintf( D_ALWAYS, "(%d.%d) job cancel failed: %s\n",
						 procID.cluster, procID.proc, errorString.c_str() );
				gmState = GM_FAILED;
			}
			} break;
		case GM_FAILED: {
			myResource->CancelSubmit( this );
			SetRemoteJobId( NULL );

			if ( condorState == REMOVED ) {
				gmState = GM_DELETE;
			} else {
				gmState = GM_CLEAR_REQUEST;
			}
			} break;
		case GM_DELETE: {
			// The job has completed or been removed. Delete it from the
			// schedd.
			DoneWithJob();
			// This object will be deleted when the update occurs
			} break;
		case GM_CLEAR_REQUEST: {
			// Remove all knowledge of any previous or present job
			// submission, in both the gridmanager and the schedd.

			// If we are doing a rematch, we are simply waiting around
			// for the schedd to be updated and subsequently this globus job
			// object to be destroyed.  So there is nothing to do.
			if ( wantRematch ) {
				break;
			}

			// For now, put problem jobs on hold instead of
			// forgetting about current submission and trying again.
			// TODO: Let our action here be dictated by the user preference
			// expressed in the job ad.
			if ( remoteJobId != NULL
				     && condorState != REMOVED 
					 && wantResubmit == 0 
					 && doResubmit == 0 ) {
				gmState = GM_HOLD;
				break;
			}
			// Only allow a rematch *if* we are also going to perform a resubmit
			if ( wantResubmit || doResubmit ) {
				jobAd->EvalBool(ATTR_REMATCH_CHECK,NULL,wantRematch);
			}
			if ( wantResubmit ) {
				wantResubmit = 0;
				dprintf(D_ALWAYS,
						"(%d.%d) Resubmitting to Globus because %s==TRUE\n",
						procID.cluster, procID.proc, ATTR_GLOBUS_RESUBMIT_CHECK );
			}
			if ( doResubmit ) {
				doResubmit = 0;
				dprintf(D_ALWAYS,
					"(%d.%d) Resubmitting to Globus (last submit failed)\n",
						procID.cluster, procID.proc );
			}
			errorString = "";
			if ( remoteJobId != NULL ) {
				SetRemoteJobId( NULL );
			}
			JobIdle();
			if ( submitLogged ) {
				JobEvicted();
				if ( !evictLogged ) {
					WriteEvictEventToUserLog( jobAd );
					evictLogged = true;
				}
			}
			myResource->CancelSubmit( this );
			
			if ( wantRematch ) {
				dprintf(D_ALWAYS,
						"(%d.%d) Requesting schedd to rematch job because %s==TRUE\n",
						procID.cluster, procID.proc, ATTR_REMATCH_CHECK );

				// Set ad attributes so the schedd finds a new match.
				int dummy;
				if ( jobAd->LookupBool( ATTR_JOB_MATCHED, dummy ) != 0 ) {
					jobAd->Assign( ATTR_JOB_MATCHED, false );
					jobAd->Assign( ATTR_CURRENT_HOSTS, 0 );
				}

				// If we are rematching, we need to forget about this job
				// cuz we wanna pull a fresh new job ad, with a fresh new match,
				// from the all-singing schedd.
				gmState = GM_DELETE;
				break;
			}
			
			// If there are no updates to be done when we first enter this
			// state, requestScheddUpdate will return done immediately
			// and not waste time with a needless connection to the
			// schedd. If updates need to be made, they won't show up in
			// schedd_actions after the first pass through this state
			// because we modified our local variables the first time
			// through. However, since we registered update events the
			// first time, requestScheddUpdate won't return done until
			// they've been committed to the schedd.
			const char *name;
			ExprTree *expr;
			jobAd->ResetExpr();
			if ( jobAd->NextDirtyExpr(name, expr) ) {
				requestScheddUpdate( this, true );
				break;
			}
			if ( remoteJobState != "" ) {
				remoteJobState = "";
				SetRemoteJobStatus( NULL );
			}
			submitLogged = false;
			executeLogged = false;
			submitFailedLogged = false;
			terminateLogged = false;
			abortLogged = false;
			evictLogged = false;
			gmState = GM_UNSUBMITTED;
			} break;
		case GM_HOLD: {
			// Put the job on hold in the schedd.
			// TODO: what happens if we learn here that the job is removed?
			// If the condor state is already HELD, then someone already
			// HELD it, so don't update anything else.
			if ( condorState != HELD ) {

				// Set the hold reason as best we can
				// TODO: set the hold reason in a more robust way.
				char holdReason[1024];
				holdReason[0] = '\0';
				holdReason[sizeof(holdReason)-1] = '\0';
				jobAd->LookupString( ATTR_HOLD_REASON, holdReason,
									 sizeof(holdReason) );
				if ( holdReason[0] == '\0' && errorString != "" ) {
					strncpy( holdReason, errorString.c_str(),
							 sizeof(holdReason) - 1 );
				}
				if ( holdReason[0] == '\0' ) {
					strncpy( holdReason, "Unspecified gridmanager error",
							 sizeof(holdReason) - 1 );
				}

				JobHeld( holdReason );
			}
			gmState = GM_DELETE;
			} break;
		default:
			EXCEPT( "(%d.%d) Unknown gmState %d!", procID.cluster,procID.proc,
					gmState );
		}

		if ( gmState != old_gm_state ) {
			reevaluate_state = true;
			dprintf(D_FULLDEBUG, "(%d.%d) gm state change: %s -> %s\n",
					procID.cluster, procID.proc, GMStateNames[old_gm_state],
					GMStateNames[gmState]);
			enteredCurrentGmState = time(NULL);

			// If we were calling a gahp call that used RSL, we're done
			// with it now, so free it.
			if ( RSL ) {
				delete RSL;
				RSL = NULL;
			}
			if ( stageList ) {
				delete stageList;
				stageList = NULL;
			}
			if ( stageLocalList ) {
				delete stageLocalList;
				stageLocalList = NULL;
			}
		}

	} while ( reevaluate_state );
}
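Stripped of the grid-specific details, doEvaluateState() is a classic run-to-quiescence state machine: evaluate the current state, and if the state changed, run the switch again so several transitions can happen in one timer firing. A minimal sketch of just that skeleton (hypothetical states, not taken from the original):

enum GmState { GM_INIT, GM_WORKING, GM_DONE };

void evaluate_state( GmState &gm_state )
{
	bool reevaluate_state;
	do {
		reevaluate_state = false;
		GmState old_gm_state = gm_state;

		switch ( gm_state ) {
		case GM_INIT:
			gm_state = GM_WORKING;	// ... initialization work ...
			break;
		case GM_WORKING:
			gm_state = GM_DONE;		// ... do work, pick the next state ...
			break;
		case GM_DONE:
			break;					// terminal state: no transition
		}

		if ( gm_state != old_gm_state ) {
			reevaluate_state = true;	// re-run the switch in the new state
		}
	} while ( reevaluate_state );
}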
Example #22
//---------------------------------------------------------------------------
void main_init (int argc, char ** const argv) {

	printf ("Executing condor dagman ... \n");

		// flag used if DAGMan is invoked with -WaitForDebug so we
		// wait for a developer to attach with a debugger...
	volatile int wait_for_debug = 0;

		// process any config vars -- this happens before we process
		// argv[], since arguments should override config settings
	dagman.Config();

	// The DCpermission (last parm) should probably be PARENT, if it existed
    daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
                                 (SignalHandler) main_shutdown_remove,
                                 "main_shutdown_remove", NULL);

/****** FOR TESTING *******
    daemonCore->Register_Signal( SIGUSR2, "SIGUSR2",
                                 (SignalHandler) main_testing_stub,
                                 "main_testing_stub", NULL);
****** FOR TESTING ********/
    debug_progname = condor_basename(argv[0]);

		// condor_submit_dag version from .condor.sub
	bool allowVerMismatch = false;
	const char *csdVersion = "undefined";

	int i;
    for (i = 0 ; i < argc ; i++) {
        debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] );
    }

    if (argc < 2) Usage();  //  Make sure an input file was specified

		// get dagman job id from environment, if it's there
		// (otherwise it will be set to "-1.-1.-1")
	dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) );

	dagman._dagmanClassad = new DagmanClassad( dagman.DAGManJobId );

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		// Minimum legal version for a .condor.sub file to be compatible
		// with this condor_dagman binary.

		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
		// Be sure to change this if the arguments or environment
		// passed to condor_dagman change in an incompatible way!!
		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	struct DagVersionData {
		int majorVer;
		int minorVer;
		int subMinorVer;
	};
	const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 };

		// Construct a string of the minimum submit file version.
	MyString minSubmitVersionStr;
	minSubmitVersionStr.formatstr( "%d.%d.%d",
				MIN_SUBMIT_FILE_VERSION.majorVer,
				MIN_SUBMIT_FILE_VERSION.minorVer,
				MIN_SUBMIT_FILE_VERSION.subMinorVer );
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    //
    // Process command-line arguments
    //
    for (i = 1; i < argc; i++) {
        if( !strcasecmp( "-Debug", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No debug level specified\n" );
                Usage();
            }
            debug_level = (debug_level_t) atoi (argv[i]);
        } else if( !strcasecmp( "-Lockfile", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" );
                Usage();
            }
            lockFileName = argv[i];
        } else if( !strcasecmp( "-Help", argv[i] ) ) {
            Usage();
        } else if (!strcasecmp( "-Dag", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DAG specified\n" );
                Usage();
            }
			dagman.dagFiles.append( argv[i] );
        } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxIdle\n" );
                Usage();
            }
            dagman.maxIdle = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxJobs\n" );
                Usage();
            }
            dagman.maxJobs = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) {
			debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with "
						   "-MaxPre and -MaxPost arguments\n" );
			Usage();
        } else if( !strcasecmp( "-MaxPre", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPre\n" );
                Usage();
            }
            dagman.maxPreScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxPost", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPost\n" );
                Usage();
            }
            dagman.maxPostScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) {
			debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is "
						"ignored; please use the DAGMAN_ALLOW_EVENTS "
						"config parameter instead\n");
			check_warning_strictness( DAG_STRICT_2 );

        } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) {
			dagman.allowLogError = true;

        } else if( !strcasecmp( "-DontAlwaysRunPost",argv[i] ) ) {
			dagman._runPost = false;

        } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) {
			wait_for_debug = 1;

        } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) {
			dagman.useDagDir = true;

        } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" );
                Usage();
            }
            dagman.autoRescue = (atoi( argv[i] ) != 0);

        } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" );
                Usage();
            }
            dagman.doRescueFrom = atoi (argv[i]);

        } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" );
                Usage();
            }
			csdVersion = argv[i];

        } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) {
			allowVerMismatch = true;

        } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) {
			dagman.dumpRescueDag = true;

        } else if( !strcasecmp( "-verbose", argv[i] ) ) {
			dagman._submitDagDeepOpts.bVerbose = true;

        } else if( !strcasecmp( "-force", argv[i] ) ) {
			dagman._submitDagDeepOpts.bForce = true;
		
        } else if( !strcasecmp( "-notification", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No notification value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strNotification = argv[i];

		} else if( !strcasecmp( "-suppress_notification",argv[i] ) ) {
			dagman._submitDagDeepOpts.suppress_notification = true;

		} else if( !strcasecmp( "-dont_suppress_notification",argv[i] ) ) {
			dagman._submitDagDeepOpts.suppress_notification = false;

        } else if( !strcasecmp( "-dagman", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No dagman value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strDagmanPath = argv[i];

        } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strOutfileDir = argv[i];

        } else if( !strcasecmp( "-update_submit", argv[i] ) ) {
			dagman._submitDagDeepOpts.updateSubmit = true;

        } else if( !strcasecmp( "-import_env", argv[i] ) ) {
			dagman._submitDagDeepOpts.importEnv = true;

        } else if( !strcasecmp( "-priority", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_NORMAL, "No priority value specified\n");
                Usage();
            }
            dagman._submitDagDeepOpts.priority = atoi(argv[i]);
		} else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) {
			dagman._submitDagDeepOpts.always_use_node_log = false;
        } else {
    		debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n",
						argv[i] );
			Usage();
		}
    }

	dagman.dagFiles.rewind();
	dagman.primaryDagFile = dagman.dagFiles.next();
	dagman.multiDags = (dagman.dagFiles.number() > 1);

	MyString tmpDefaultLog;
	if ( dagman._defaultNodeLog != NULL ) {
		tmpDefaultLog = dagman._defaultNodeLog;
		free( dagman._defaultNodeLog );
	} else {
		tmpDefaultLog = dagman.primaryDagFile + ".nodes.log";
	}

		// Force default log file path to be absolute so it works
		// with -usedagdir and DIR nodes.
	CondorError errstack;
	if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) {
       	debug_printf( DEBUG_QUIET, "Unable to convert default log "
					"file name to absolute path: %s\n",
					errstack.getFullText().c_str() );
			// Note: dagman.dag has not been constructed yet at this
			// point, so we can't write a DAGMan-finished event to the
			// jobstate log here.
		DC_Exit( EXIT_ERROR );
	}
	dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() );
	debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n",
				dagman._defaultNodeLog);

    //
    // Check the arguments
    //

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// Checking for version compatibility between the .condor.sub
	// file and this condor_dagman binary...

	// Note: if we're in recovery mode and the submit file version
	// causes us to quit, we leave any existing node jobs still
	// running -- may want to change that eventually.  wenger 2009-10-13.

		// Version of the condor_submit_dag that created our submit file.
	CondorVersionInfo submitFileVersion( csdVersion );

		// Version of this condor_dagman binary.
	CondorVersionInfo dagmanVersion;

		// Just generate this message fragment in one place.
	MyString versionMsg;
	versionMsg.formatstr("the version (%s) of this DAG's Condor submit "
				"file (created by condor_submit_dag)", csdVersion );

		// Make sure version in submit file is valid.
	if( !submitFileVersion.is_valid() ) {
		if ( !allowVerMismatch ) {
        	debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n",
						versionMsg.Value() );
			DC_Exit( EXIT_ERROR );
		} else {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; "
						"continuing because of -AllowVersionMismatch flag\n",
						versionMsg.Value() );
		}

		// Make sure .condor.sub file is recent enough.
	} else if ( submitFileVersion.compare_versions(
				CondorVersion() ) != 0 ) {

		if( !submitFileVersion.built_since_version(
					MIN_SUBMIT_FILE_VERSION.majorVer,
					MIN_SUBMIT_FILE_VERSION.minorVer,
					MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) {
			if ( !allowVerMismatch ) {
        		debug_printf( DEBUG_QUIET, "Error: %s is older than "
							"oldest permissible version (%s)\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
				DC_Exit( EXIT_ERROR );
			} else {
        		debug_printf( DEBUG_NORMAL, "Warning: %s is older than "
							"oldest permissible version (%s); continuing "
							"because of -AllowVersionMismatch flag\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
			}

			// Warn if .condor.sub file is a newer version than this binary.
		} else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is newer than "
						"condor_dagman version (%s)\n", versionMsg.Value(),
						CondorVersion() );
			check_warning_strictness( DAG_STRICT_3 );
		} else {
        	debug_printf( DEBUG_NORMAL, "Note: %s differs from "
						"condor_dagman version (%s), but the "
						"difference is permissible\n", 
						versionMsg.Value(), CondorVersion() );
		}
	}
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    if( dagman.primaryDagFile == "" ) {
        debug_printf( DEBUG_SILENT, "No DAG file was specified\n" );
        Usage();
    }
    if (lockFileName == NULL) {
        debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" );
        Usage();
    }
    if( dagman.maxJobs < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n");
        Usage();
    }
    if( dagman.maxPreScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" );
        Usage();
    }
    if( dagman.maxPostScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" );
        Usage();
    }
    if( dagman.doRescueFrom < 0 ) {
        debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" );
        Usage();
    }

    debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n",
                   lockFileName );
	if ( dagman.dagFiles.number() == 1 ) {
    	debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n",
				  	dagman.primaryDagFile.Value() );
	} else {
		MyString msg = "DAG Input files are ";
		dagman.dagFiles.rewind();
		const char *dagFile;
		while ( (dagFile = dagman.dagFiles.next()) != NULL ) {
			msg += dagFile;
			msg += " ";
		}
		msg += "\n";
    	debug_printf( DEBUG_VERBOSE, "%s", msg.Value() );
	}

		// if requested, wait for someone to attach with a debugger...
	while( wait_for_debug ) { }

    {
		MyString cwd;
		if( !condor_getcwd(cwd) ) {
			cwd = "<null>";
		}
        debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value());

		char *temp = my_username();
		debug_printf( DEBUG_DEBUG_1, "Current user is %s\n",
					   temp ? temp : "<null>" );
		if( temp ) {
			free( temp );
		}
    }

		//
		// Figure out the rescue DAG to run, if any (this is with "new-
		// style" rescue DAGs).
		//
	int rescueDagNum = 0;
	MyString rescueDagMsg;

	if ( dagman.doRescueFrom != 0 ) {
		rescueDagNum = dagman.doRescueFrom;
		rescueDagMsg.formatstr( "Rescue DAG number %d specified", rescueDagNum );
		RenameRescueDagsAfter( dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum );

	} else if ( dagman.autoRescue ) {
		rescueDagNum = FindLastRescueDagNum(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum );
		rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum );
	}

		//
		// Fill in values in the deep submit options that we haven't
		// already set.
		//
	dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError;
	dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir;
	dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue;
	dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom;
	dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch;
	dagman._submitDagDeepOpts.recurse = false;

    //
    // Create the DAG
    //

	// Note: a bunch of the parameters we pass here duplicate things
	// in submitDagOpts, but I'm keeping them separate so we don't have to
	// bother to construct a new SubmitDagOtions object for splices.
	// wenger 2010-03-25
    dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs,
						  dagman.maxPreScripts, dagman.maxPostScripts,
						  dagman.allowLogError, dagman.useDagDir,
						  dagman.maxIdle, dagman.retrySubmitFirst,
						  dagman.retryNodeFirst, dagman.condorRmExe,
						  dagman.storkRmExe, &dagman.DAGManJobId,
						  dagman.prohibitMultiJobs, dagman.submitDepthFirst,
						  dagman._defaultNodeLog,
						  dagman._generateSubdagSubmits,
						  &dagman._submitDagDeepOpts,
						  false ); /* toplevel dag! */

    if( dagman.dag == NULL ) {
        EXCEPT( "ERROR: out of memory!\n");
    }

	dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit );
	dagman.dag->SetAllowEvents( dagman.allow_events );
	dagman.dag->SetConfigFile( dagman._dagmanConfigFile );
	dagman.dag->SetMaxJobHolds( dagman._maxJobHolds );
	dagman.dag->SetPostRun(dagman._runPost);
	if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line
		dagman.dag->SetDefaultPriority(dagman._submitDagDeepOpts.priority);
	} else if( dagman._defaultPriority != 0 ) { // From config file
		dagman.dag->SetDefaultPriority(dagman._defaultPriority);
		dagman._submitDagDeepOpts.priority = dagman._defaultPriority;
	}

    //
    // Parse the input files.  The parse() routine
    // takes care of adding jobs and dependencies to the DagMan
    //
	dagman.mungeNodeNames = (dagman.dagFiles.number() > 1);
	parseSetDoNameMunge( dagman.mungeNodeNames );
   	debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n", 
		dagman.dagFiles.number() );
	dagman.dagFiles.rewind();
	char *dagFile;

	// Here we make a copy of the dagFiles for iteration purposes. Deep inside
	// of the parsing, copies of the dagman.dagFile string list happen which
	// mess up the iteration of this list.
	StringList sl( dagman.dagFiles );
	sl.rewind();
	while ( (dagFile = sl.next()) != NULL ) {
    	debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile );

    	if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							false, true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			tolerant_unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagFile );
    	}
	}
	if( dagman.dag->GetDefaultPriority() != 0 ) {
		dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag
	}
	dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId );
	if ( rescueDagNum > 0 ) {
			// Get our Pegasus sequence numbers set correctly.
		dagman.dag->GetJobstateLog().InitializeRescue();
	}

	// lift the final set of splices into the main dag.
	dagman.dag->LiftSplices(SELF);

		//
		// Actually parse the "new-new" style (partial DAG info only)
		// rescue DAG here.  Note: this *must* be done after splices
		// are lifted!
		//
	if ( rescueDagNum > 0 ) {
		dagman.rescueFileToRun = RescueDagName(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum );
		debug_printf ( DEBUG_QUIET, "%s; running %s in combination with "
					"normal DAG file%s\n", rescueDagMsg.Value(),
					dagman.rescueFileToRun.Value(),
					dagman.multiDags ? "s" : "");
		debug_printf ( DEBUG_QUIET,
					"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

		debug_printf ( DEBUG_QUIET, "USING RESCUE DAG %s\n",
					dagman.rescueFileToRun.Value() );

			// Turn off node name munging for the rescue DAG, because
			// it will already have munged node names.
		parseSetDoNameMunge( false );

    	if( !parse( dagman.dag, dagman.rescueFileToRun.Value(),
					dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			tolerant_unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagman.rescueFileToRun.Value() );
    	}
	}

	dagman.dag->CheckThrottleCats();

	// fix up any use of $(JOB) in the vars values for any node
	dagman.dag->ResolveVarsInterpolations();

/*	debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/
/*	dagman.dag->PrintJobList();*/

#ifndef NOT_DETECT_CYCLE
	if( dagman.startup_cycle_detect && dagman.dag->isCycle() )
	{
		// Note: maybe we should run the final node here, if there is one.
		// wenger 2011-12-19.
		debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, please check input\n");
	}
#endif
    debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n",
				  dagman.dag->NumNodes( true ) );

	MyString firstLocation;
	if ( dagman.dag->GetReject( firstLocation ) ) {
    	debug_printf( DEBUG_QUIET, "Exiting because of REJECT "
					"specification in %s.  This most likely means "
					"that the DAG file was produced with the -DumpRescue "
					"flag when parsing the original DAG failed.\n",
					firstLocation.Value() );
		DC_Exit( EXIT_ERROR );
		return;
	}

	dagman.dag->DumpDotFile();

	if ( dagman.dumpRescueDag ) {
    	debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting "
					"because of -DumpRescue flag\n" );
		dagman.dag->Rescue( dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum, false,
					false, false );
		ExitSuccess();
		return;
	}

    //------------------------------------------------------------------------
    // Bootstrap and Recovery
    //
    // If the lock file exists, a previous run of DAGMan terminated
    // prematurely, so we run in recovery mode; otherwise this is a
    // normal bootstrap.

    {
      bool recovery = access(lockFileName,  F_OK) == 0;
      
        if (recovery) {
            debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n",
                           lockFileName);
			if (dagman.abortDuplicates) {
				if (util_check_lock_file(lockFileName) == 1) {
        			debug_printf( DEBUG_QUIET, "Aborting because it "
							"looks like another instance of DAGMan is "
							"currently running on this DAG; if that is "
							"not the case, delete the lock file (%s) "
							"and re-submit the DAG.\n", lockFileName );
					dagman.dag->GetJobstateLog().
								WriteDagmanFinished( EXIT_RESTART );
    				dagman.CleanUp();
					DC_Exit( EXIT_ERROR );
					// We should never get to here!
				}
			}
        }

			//
			// If this DAGMan continues, it should overwrite the lock
			// file if it exists.
			//
		util_create_lock_file(lockFileName, dagman.abortDuplicates);

        debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n");
        if( !dagman.dag->Bootstrap( recovery ) ) {
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n");
        }
		print_status();
    }

    debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" );
    daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval, 
				condor_event_timer, "condor_event_timer" );

	dagman.dag->SetPendingNodeReportInterval(
				dagman.pendingReportInterval );
}
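
main_init() above repeats one idiom for every flag that takes a value: advance i, then check that a non-empty argument actually follows before consuming it. Here is a minimal sketch of that idiom outside DaemonCore, assuming a stand-in Usage() and a single hypothetical -MaxJobs option.

#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <strings.h>	// strcasecmp

static void Usage() { fprintf( stderr, "usage: prog -MaxJobs <n>\n" ); exit(1); }

int main(int argc, char **argv)
{
	int maxJobs = 0;
	for (int i = 1; i < argc; i++) {
		if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
			i++;	// the value is the *next* argument
			if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
				fprintf( stderr, "Integer missing after -MaxJobs\n" );
				Usage();
			}
			maxJobs = atoi( argv[i] );
		} else {
			fprintf( stderr, "Unrecognized argument: %s\n", argv[i] );
			Usage();
		}
	}
	printf( "maxJobs = %d\n", maxJobs );
	return 0;
}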
Example #23
ClassAd * CollectorEngine::
updateClassAd (CollectorHashTable &hashTable,
			   const char *adType,
			   const char *label,
			   ClassAd *ad,
			   AdNameHashKey &hk,
			   const MyString &hashString,
			   int  &insert,
			   const condor_sockaddr& /*from*/ )
{
	ClassAd		*old_ad, *new_ad;
	MyString	buf;
	time_t		now;

		// NOTE: LastHeardFrom will already be in the ad if we are loading
		// ads from the offline classad collection, so don't mess with
		// it if it is already there
	if( !ad->LookupExpr(ATTR_LAST_HEARD_FROM) ) {
		(void) time (&now);
		if (now == (time_t) -1)
		{
			EXCEPT ("Error reading system time!");
		}	
		buf.sprintf( "%s = %d", ATTR_LAST_HEARD_FROM, (int)now);
		ad->Insert ( buf.Value() );
	}

	// this time stamped ad is the new ad
	new_ad = ad;

	// check if it already exists in the hash table ...
	if ( hashTable.lookup (hk, old_ad) == -1)
    {	 	
		// no ... new ad
		dprintf (D_ALWAYS, "%s: Inserting ** \"%s\"\n", adType, hashString.Value() );

		// Update statistics
		collectorStats->update( label, NULL, new_ad );

		// Now, store it away
		if (hashTable.insert (hk, new_ad) == -1)
		{
			EXCEPT ("Error inserting ad (out of memory)");
		}
		
		insert = 1;
		
		return new_ad;
	}
	else
    {
		// yes ... old ad must be updated
		dprintf (D_FULLDEBUG, "%s: Updating ... \"%s\"\n", adType, hashString.Value() );

		// Update statistics
		collectorStats->update( label, old_ad, new_ad );

		// Now, finally, store the new ClassAd
		if (hashTable.remove(hk) == -1) {
			EXCEPT( "Error removing ad" );
		}
		if (hashTable.insert(hk, new_ad) == -1) {
			EXCEPT( "Error inserting ad" );
		}

		delete old_ad;

		insert = 0;
		return new_ad;
	}
}
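
updateClassAd() is an insert-or-update on a collector hash table: a failed lookup means a fresh insert (insert = 1), while a hit replaces the stored ad and deletes the old one (insert = 0). A sketch of the same shape with std::unordered_map, using plain strings as stand-ins for AdNameHashKey and ClassAd:

#include <cstdio>
#include <string>
#include <unordered_map>

// Returns the stored value; 'insert' is 1 on a fresh insert, 0 on an
// update, mirroring the out-parameter in CollectorEngine::updateClassAd().
std::string &updateAd( std::unordered_map<std::string, std::string> &table,
                       const std::string &key, const std::string &ad, int &insert )
{
	auto it = table.find( key );
	if ( it == table.end() ) {
		insert = 1;	// no such ad yet: insert it
		return table.emplace( key, ad ).first->second;
	}
	insert = 0;	// old ad exists: replace it
	it->second = ad;	// (the real code deletes old_ad here)
	return it->second;
}

int main()
{
	std::unordered_map<std::string, std::string> table;
	int insert = -1;
	updateAd( table, "slot1@host", "Ad v1", insert );
	printf( "first call: insert=%d\n", insert );	// insert=1
	updateAd( table, "slot1@host", "Ad v2", insert );
	printf( "second call: insert=%d\n", insert );	// insert=0
	return 0;
}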
Example #24
// copy ctor; makes deep copy
CondorQuery::
CondorQuery (const CondorQuery & /* from */)
{
		// Unimplemented!
	EXCEPT( "CondorQuery copy constructor called, but unimplemented!" );
}
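
Example #24 turns an accidental copy into a loud runtime EXCEPT(). In C++11 and later the same intent can be stated at compile time; a sketch of the modern equivalent (the class name here is hypothetical):

class CondorQueryModern {
public:
	CondorQueryModern() = default;
		// Copying is not implemented, so forbid it at compile time
		// instead of EXCEPT()ing at run time.
	CondorQueryModern( const CondorQueryModern & ) = delete;
	CondorQueryModern &operator=( const CondorQueryModern & ) = delete;
};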
Example #25
void
TranslateClient::hookExited(int exit_status)
{
	std::string key = m_routed_job->src_key;
	if (false == JobRouterHookMgr::removeKnownHook(key.c_str(), HOOK_TRANSLATE_JOB))
	{
		dprintf(D_ALWAYS|D_FAILURE, "TranslateClient::hookExited (%s):"
			"Failed to remove hook info for job key %s.\n", 
			m_routed_job->JobDesc().c_str(), key.c_str());
		EXCEPT("TranslateClient::hookExited: Received exit "
			"notification for job with key %s, which isn't a key "
			"for a job known to have a translate hook running.",
			 key.c_str());
		return;
	}

	HookClient::hookExited(exit_status);

	if (m_std_err.Length())
	{
		dprintf(D_ALWAYS, "TranslateClient::hookExited (%s): "
				"Warning, hook %s (pid %d) printed to stderr: "
				"%s\n", m_routed_job->JobDesc().c_str(), 
				m_hook_path, (int)m_pid, m_std_err.Value());
	}
	if (m_std_out.Length() && 0 == WEXITSTATUS(exit_status))
	{
		ClassAd job_ad;
		const char* hook_line = NULL;

		m_std_out.Tokenize();
		while ((hook_line = m_std_out.GetNextToken("\n", true)))
		{
			if (!job_ad.Insert(hook_line))
			{
				dprintf(D_ALWAYS, "TranslateClient::hookExited "
						"(%s): Failed to insert \"%s\" "
						"into ClassAd, ignoring "
						"invalid hook output\n", 
						m_routed_job->JobDesc().c_str(),
						hook_line);
				job_router->GracefullyRemoveJob(m_routed_job);
				return;
			}
		}
		m_routed_job->dest_ad = job_ad;
	}
	else
	{
		if (0 == WEXITSTATUS(exit_status))
		{
			dprintf(D_ALWAYS, "TranslateClient::hookExited (%s): "
					"Hook %s (pid %d) returned no data.\n",
					m_routed_job->JobDesc().c_str(), 
					m_hook_path, (int)m_pid);
		}
		else
		{
			dprintf(D_ALWAYS, "TranslateClient::hookExited (%s): "
					"Hook %s (pid %d) exited with return "
					"status %d.  Ignoring output.\n", 
					m_routed_job->JobDesc().c_str(),
					m_hook_path, (int)m_pid,
					(int)WEXITSTATUS(exit_status));
			job_router->GracefullyRemoveJob(m_routed_job);
		}
		return;
	}

	job_router->FinishSubmitJob(m_routed_job);
}
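
TranslateClient::hookExited() treats the hook's stdout as newline-separated ClassAd assignments and abandons the whole update on the first line that fails to insert. A standalone sketch of that all-or-nothing parse, with a toy "name = value" format standing in for ClassAd syntax:

#include <cstdio>
#include <map>
#include <sstream>
#include <string>

// Parse "name = value" lines; on the first malformed line, discard
// everything and report failure, as the hook client does.
bool parseHookOutput( const std::string &std_out,
                      std::map<std::string, std::string> &ad )
{
	std::map<std::string, std::string> tmp;
	std::istringstream in( std_out );
	std::string line;
	while ( std::getline( in, line ) ) {
		if ( line.empty() ) continue;	// GetNextToken(..., true) skips empties
		size_t eq = line.find( '=' );
		if ( eq == std::string::npos ) {
			fprintf( stderr, "ignoring invalid hook output: \"%s\"\n",
					line.c_str() );
			return false;	// reject the whole update
		}
		tmp[line.substr( 0, eq )] = line.substr( eq + 1 );
	}
	ad.swap( tmp );	// commit only if every line parsed
	return true;
}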
Example #26
int AutoCluster::getAutoClusterid( ClassAd *job )
{
	int cur_id = -1;

		// first check if condor_config file even desires this
		// functionality...
	if ( !significant_attrs ) {
		return -1;
	}

    job->LookupInteger(ATTR_AUTO_CLUSTER_ID, cur_id);
	if ( cur_id != -1 ) {
			// we've previously figured it out...
		
			// tag it as touched
		cluster_in_use.insert(cur_id);

		return cur_id;
	}

		// summarize job into a string "signature"
		// first put significant attrs from target into the signature
	std::string signature;
	char *buf;
	significant_attrs->rewind();
	const char* next_attr = NULL;
	while ( (next_attr=significant_attrs->next()) != NULL ) {
		buf = NULL;
		buf = sPrintExpr(*job, next_attr);
		if (buf) {
			signature += buf;
			free(buf);
		}
	}
		// now put significant attrs from self into the signature.
		// note: only do this if significant_attributes is not explicitly
		// defined in our config file; if it is in our condor_config, then
		// we only want to consider the attributes listed by the admin.
	StringList internal_refs;	// this is what we want to know
	if ( !sig_attrs_came_from_config_file ) {
		// get all internal references in the job ad.
		StringList external_refs;	// we do not care about these
		job->GetReferences(ATTR_REQUIREMENTS,internal_refs,external_refs);
		internal_refs.remove_anycase(ATTR_CURRENT_TIME);	// never want this attr
		internal_refs.append(ATTR_REQUIREMENTS);	// always want these attrs
		internal_refs.append(ATTR_NICE_USER);
		internal_refs.append(ATTR_CONCURRENCY_LIMITS);

		internal_refs.rewind();
		next_attr = NULL;
		while ( (next_attr=internal_refs.next()) != NULL ) {
				// skip this attr if already in our signature from above...
			if ( significant_attrs->contains_anycase(next_attr) ) {
				internal_refs.deleteCurrent();
				continue;
			}
			buf = NULL;
			buf = sPrintExpr(*job, next_attr);
			if (buf) {
				signature += buf;
				free(buf);
			}
		}
	}

		// try to find a fit
	AutoClusterMap::iterator it;
	it = cluster_map.find(signature);
	if( it != cluster_map.end() ) {
		cur_id = it->second;
	}
	else {
		cur_id = next_id++;
		if( cur_id < 0 ) {
				// We've wrapped around MAX_INT!
				// In config() we take steps to avoid this unlikely condition.
			EXCEPT("Auto cluster IDs exhausted! (allocated %d)",cur_id);
		}

		cluster_map.insert(AutoClusterMap::value_type(signature,cur_id));
	}

		// put the new auto cluster id into the job ad to cache it.
	job->Assign(ATTR_AUTO_CLUSTER_ID,cur_id);

		// tag it as touched
	cluster_in_use.insert(cur_id);

		// for some nice feedback, place the final list of attrs used to create this
		// signature into the job ad.
		// the ATTR_AUTO_CLUSTER_ATTRS attribute is also used by SetAttribute()
		// in qmgmt -- if any of the attrs used to create the signature are
		// changed, then SetAttribute() will delete the ATTR_AUTO_CLUSTER_ID, since
		// the signature needs to be recomputed as it may have changed.
	MyString final_list;
	final_list += ATTR_AUTO_CLUSTER_ATTRS;
	final_list += "=\"";
	char *tmp;
	bool need_comma = false;
	tmp = significant_attrs->print_to_string();
	if (tmp) {
		final_list += tmp;
		need_comma = true;
		free(tmp);
	}
	tmp = internal_refs.print_to_string();
	if (tmp) {
		if ( need_comma ) {
			final_list += ',';
		}
		final_list += tmp;
		free(tmp);
	}
	final_list += "\"";
	job->Insert(final_list.Value());


	return cur_id;
}
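
getAutoClusterid() reduces to: build a canonical string signature from the significant attributes, then memoize signature -> id, allocating a fresh id on a miss. A sketch of that core with std::map, using an ordered list of name/value pairs in place of the ClassAd (order matters, since the signature is a plain concatenation):

#include <map>
#include <string>
#include <utility>
#include <vector>

class AutoClusterSketch {
	std::map<std::string, int> cluster_map;	// signature -> id
	int next_id = 1;
public:
	int getId( const std::vector<std::pair<std::string, std::string>> &attrs )
	{
			// Concatenate "name = value;" for every significant
			// attribute; jobs with identical signatures share an
			// auto cluster id.
		std::string signature;
		for ( const auto &a : attrs ) {
			signature += a.first + " = " + a.second + ";";
		}
		auto it = cluster_map.find( signature );
		if ( it != cluster_map.end() ) return it->second;
		int id = next_id++;
		cluster_map.emplace( signature, id );
		return id;
	}
};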
Example #27
void
ExitClient::hookExited(int exit_status) {
	std::string key = m_routed_job->dest_key;
	if (false == JobRouterHookMgr::removeKnownHook(key.c_str(), HOOK_JOB_EXIT))
	{
		dprintf(D_ALWAYS|D_FAILURE, "ExitClient::hookExited (%s): "
			"Failed to remove hook info for job key %s.\n",
			m_routed_job->JobDesc().c_str(), key.c_str());
		EXCEPT("ExitClient::hookExited: Received exit notification for "
			"job with key %s, which isn't a key for a job known "
			"to have an exit hook running.", key.c_str());
		return;
	}

	HookClient::hookExited(exit_status);

	if (m_std_err.Length())
	{
		dprintf(D_ALWAYS, "ExitClient::hookExited (%s): Warning, hook "
				"%s (pid %d) printed to stderr: %s\n",
				m_routed_job->JobDesc().c_str(), m_hook_path,
				(int)m_pid, m_std_err.Value());
	}
	if (m_std_out.Length())
	{
		if (0 == WEXITSTATUS(exit_status))
		{
			ClassAd job_ad;
			const char* hook_line = NULL;
			classad::ClassAdCollection *ad_collection = job_router->GetScheduler()->GetClassAds();
			classad::ClassAd *orig_ad = ad_collection->GetClassAd(m_routed_job->src_key);

			m_std_out.Tokenize();
			while ((hook_line = m_std_out.GetNextToken("\n", true)))
			{
				if (!job_ad.Insert(hook_line))
				{
					dprintf(D_ALWAYS, "ExitClient::hookExited (%s): "
							"Failed to insert \"%s\" into "
							"ClassAd, ignoring invalid "
							"hook output.  Job status NOT updated.\n", m_routed_job->JobDesc().c_str(), hook_line);
					job_router->RerouteJob(m_routed_job);
					return;
				}
			}
			if (false == m_routed_job->src_ad.Update(job_ad))
			{
				dprintf(D_ALWAYS, "ExitClient::hookExited (%s):"
						" Failed to update source job "
						"status.  Job status NOT "
						"updated.\n", m_routed_job->JobDesc().c_str());
				m_routed_job->SetSrcJobAd(m_routed_job->src_key.c_str(), orig_ad, ad_collection);
				job_router->RerouteJob(m_routed_job);
				return;
			}
			if (false == job_router->PushUpdatedAttributes(m_routed_job->src_ad))
			{
				dprintf(D_ALWAYS,"ExitClient::hookExited (%s): "
						"Failed to update src job in "
						"job queue.  Job status NOT "
						"updated.\n", m_routed_job->JobDesc().c_str());
				m_routed_job->SetSrcJobAd(m_routed_job->src_key.c_str(), orig_ad, ad_collection);
				job_router->RerouteJob(m_routed_job);
				return;
			}
			else
			{
				dprintf(D_FULLDEBUG,"ExitClient::hookExited "
						"(%s): updated src job\n",
						m_routed_job->JobDesc().c_str());
			}
		}
		else
		{
			dprintf(D_FULLDEBUG, "ExitClient::hookExited (%s): "
					"Hook exited with non-zero return "
					"code, ignoring hook output.\n",
					m_routed_job->JobDesc().c_str());
		}
	}

	// If the exit hook exited with non-zero status, tell the JobRouter to
	// re-route the job.
	if (0 != WEXITSTATUS(exit_status))
	{
		// Tell the JobRouter to reroute the job.
		job_router->RerouteJob(m_routed_job);
	}
	else
	{
		// Tell the JobRouter to finalize the job.
		job_router->FinishFinalizeJob(m_routed_job);
	}
}
Example #28
void
init_network_interfaces( int config_done )
{
	dprintf( D_HOSTNAME, "Trying to get network interface information (%s)\n",
		 config_done ? "after reading config" : "config file not read" );

	std::string network_interface;

	if( config_done ) {
		param(network_interface,"NETWORK_INTERFACE");
	}
	if( network_interface.empty() ) {
		network_interface = "*";
	}

	network_interface_matches_all = (network_interface == "*");

	if( param_false( "ENABLE_IPV4" ) && param_false( "ENABLE_IPV6" ) ) {
		EXCEPT( "ENABLE_IPV4 and ENABLE_IPV6 are both false." );
	}

	std::string network_interface_ipv4;
	std::string network_interface_ipv6;
	std::string network_interface_best;
	bool ok;
	ok = network_interface_to_ip(
		"NETWORK_INTERFACE",
		network_interface.c_str(),
		network_interface_ipv4,
		network_interface_ipv6,
		network_interface_best,
		&configured_network_interface_ips);

	if( !ok ) {
		EXCEPT("Failed to determine my IP address using NETWORK_INTERFACE=%s",
			   network_interface.c_str());
	}

	//
	// Check the validity of the configuration.
	//
	if( network_interface_ipv4.empty() && param_true( "ENABLE_IPV4" ) ) {
		EXCEPT( "ENABLE_IPV4 is TRUE, but no IPv4 address was detected.  Ensure that your NETWORK_INTERFACE parameter is not set to an IPv6 address." );
	}
	// We don't have an enum type in the param system (yet), so check.
	if( (!param_true( "ENABLE_IPV4" )) && (!param_false( "ENABLE_IPV4" )) ) {
		if( strcasecmp( param( "ENABLE_IPV4" ), "AUTO" ) ) {
			EXCEPT( "ENABLE_IPV4 is '%s', must be 'true', 'false', or 'auto'.", param( "ENABLE_IPV4" ) );
		}
	}

	if( network_interface_ipv6.empty() && param_true( "ENABLE_IPV6" ) ) {
		EXCEPT( "ENABLE_IPV6 is TRUE, but no IPv6 address was detected.  Ensure that your NETWORK_INTERFACE parameter is not set to an IPv4 address." );
	}
	// We don't have an enum type in the param system (yet), so check.
	if( (!param_true( "ENABLE_IPV6" )) && (!param_false( "ENABLE_IPV6" )) ) {
		if( strcasecmp( param( "ENABLE_IPV6" ), "AUTO" ) ) {
			EXCEPT( "ENABLE_IPV6 is '%s', must be 'true', 'false', or 'auto'.", param( "ENABLE_IPV6" ) );
		}
	}

	if( (!network_interface_ipv4.empty()) && param_false( "ENABLE_IPV4" ) ) {
		EXCEPT( "ENABLE_IPV4 is false, yet we found an IPv4 address.  Ensure that NETWORK_INTERFACE is set appropriately." );
	}

	if( (!network_interface_ipv6.empty()) && param_false( "ENABLE_IPV6" ) ) {
		EXCEPT( "ENABLE_IPV6 is false, yet we found an IPv6 address.  Ensure that NETWORK_INTERFACE is set appropriately." );
	}

}
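
The ENABLE_IPV4/ENABLE_IPV6 checks above emulate a tri-state (true/false/auto) parameter by testing param_true(), param_false(), and finally comparing against "AUTO". A standalone sketch of that validation, with a hypothetical getParam() standing in for Condor's param():

#include <cstdio>
#include <cstdlib>
#include <string>
#include <strings.h>	// strcasecmp

enum class TriState { True, False, Auto };

// Hypothetical stand-in for param(): returns the raw config value.
static std::string getParam( const char * /*name*/ ) { return "auto"; }

static TriState parseTriState( const char *name )
{
	std::string v = getParam( name );
	if ( !strcasecmp( v.c_str(), "true" ) )  return TriState::True;
	if ( !strcasecmp( v.c_str(), "false" ) ) return TriState::False;
	if ( !strcasecmp( v.c_str(), "auto" ) )  return TriState::Auto;
	fprintf( stderr, "%s is '%s', must be 'true', 'false', or 'auto'.\n",
			name, v.c_str() );
	exit( 1 );
}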
Example #29
int
SharedPortClient::PassSocket(Sock *sock_to_pass,char const *shared_port_id,char const *requested_by, bool non_blocking)
{
#ifndef HAVE_SHARED_PORT

	dprintf(D_ALWAYS,"SharedPortClient::PassSocket() not supported on this platform\n");
	SharedPortClient::m_failPassSocketCalls++;
	return FALSE;

#elif WIN32

	/* Handle Windows */

	// Fill in requested_by first, so the log messages below can use it.
	MyString requested_by_buf;
	if( !requested_by ) {
		requested_by_buf.formatstr(
			" as requested by %s", sock_to_pass->peer_description());
		requested_by = requested_by_buf.Value();
	}

	if( !SharedPortIdIsValid(shared_port_id) ) {
		dprintf(D_ALWAYS,
				"ERROR: SharedPortClient: refusing to connect to shared port"
				"%s, because specified id is illegal! (%s)\n",
				requested_by, shared_port_id );
		SharedPortClient::m_failPassSocketCalls++;
		return FALSE;
	}

	std::string pipe_name;
	SharedPortEndpoint::GetDaemonSocketDir(pipe_name);
	formatstr_cat(pipe_name, "%c%s", DIR_DELIM_CHAR, shared_port_id);

	HANDLE child_pipe;
	
	while(true)
	{
		child_pipe = CreateFile(
			pipe_name.c_str(),
			GENERIC_READ | GENERIC_WRITE,
			0,
			NULL,
			OPEN_EXISTING,
			0,
			NULL);

		if(child_pipe != INVALID_HANDLE_VALUE)
			break;

		if(GetLastError() == ERROR_PIPE_BUSY)
		{
			dprintf(D_FULLDEBUG, "SharedPortClient: pipe id '%s' %s is busy, waiting\n", shared_port_id, requested_by);
		#if 1 // tj: this *might*? make a difference?
			bool timeout = true;
			for (int ii = 0; ii < 5; ++ii) {
				if (WaitNamedPipe(pipe_name.c_str(), 3 * 1000)) { timeout = false; break; }
				DWORD err = GetLastError();
				dprintf(D_FULLDEBUG, "SharedPortClient: pipe id '%s' %s wait returned %d\n", shared_port_id, requested_by, err);
			}
			if (timeout)
		#else
			if (!WaitNamedPipe(pipe_name.c_str(), 20 * 1000))
		#endif
			{
				DWORD err = GetLastError();
				dprintf(D_ALWAYS, "ERROR: SharedPortClient: Wait for named pipe id '%s' %s for sending failed: %d %s\n",
					shared_port_id, requested_by, err, GetLastErrorString(err));
				SharedPortClient::m_failPassSocketCalls++;
				return FALSE;
			}
			dprintf(D_FULLDEBUG, "SharedPortClient: wait for pipe id '%s' %s succeeded.\n", shared_port_id, requested_by);
		}
		else
		{
			DWORD err = GetLastError();
			dprintf(D_ALWAYS, "ERROR: SharedPortClient: Failed to open named pipe id '%s' %s for sending socket: %d %s\n", 
				shared_port_id, requested_by, err, GetLastErrorString(err));
			SharedPortClient::m_failPassSocketCalls++;
			return FALSE;
		}
	}

	DWORD child_pid;
	DWORD read_bytes = 0;

	BOOL read_result = ReadFile(child_pipe, &child_pid, sizeof(DWORD), &read_bytes, NULL);

	if(!read_result)
	{
		DWORD last_error = GetLastError();
		dprintf(D_ALWAYS, "ERROR: SharedPortClient: Failed to read PID from pipe: %d.\n", last_error);
		CloseHandle(child_pipe);
		SharedPortClient::m_failPassSocketCalls++;
		return FALSE;
	}
	else
	{
		dprintf(D_FULLDEBUG, "SharedPortClient: Read PID: %d\n", child_pid);
	}

	#pragma pack(push, 4)
	struct {
		int id; // condor command id
		WSAPROTOCOL_INFO wsa; // payload.
	} protocol_command;
	#pragma pack(pop)
	ZeroMemory(&protocol_command, sizeof(protocol_command));

	int dup_result = WSADuplicateSocket(sock_to_pass->get_file_desc(), child_pid, &protocol_command.wsa);
	if(dup_result == SOCKET_ERROR)
	{
		dprintf(D_ALWAYS, "ERROR: SharedPortClient: Failed to duplicate socket.\n");
		CloseHandle(child_pipe);
		SharedPortClient::m_failPassSocketCalls++;
		return FALSE;
	}

	protocol_command.id = SHARED_PORT_PASS_SOCK;
	BOOL write_result = WriteFile(child_pipe, &protocol_command, sizeof(protocol_command), &read_bytes, 0);

	if(!write_result)
	{
		dprintf(D_ALWAYS, "ERROR: SharedPortClient: Failed to send WSAPROTOCOL_INFO struct: %d\n", GetLastError());
		CloseHandle(child_pipe);
		SharedPortClient::m_failPassSocketCalls++;
		return FALSE;
	}
	dprintf(D_FULLDEBUG, "SharedPortClient: Wrote %d bytes to named pipe.\n", read_bytes);
	FlushFileBuffers(child_pipe);

	CloseHandle(child_pipe);

	SharedPortClient::m_successPassSocketCalls++;
	return TRUE;

#elif HAVE_SCM_RIGHTS_PASSFD

	/* Handle most (all?) Linux/Unix and MacOS platforms */

	SharedPortState * state = new SharedPortState(static_cast<ReliSock*>(sock_to_pass),
									shared_port_id, requested_by, non_blocking);

	int result = state->Handle();

	switch (result) 
	{
	case KEEP_STREAM:
		// pass thru so that we return KEEP_STREAM; the PassSocket call is
		// pending, we want to keep the passed socket open until we get an 
		// ack from endpoint.  
		ASSERT( non_blocking ); // should only get KEEP_STREAM if non_blocking is true
		break;
	case SharedPortState::FAILED:
		result = FALSE;
		break;
	case SharedPortState::DONE:
		result = TRUE;
		break;
	case SharedPortState::CONTINUE:
	case SharedPortState::WAIT:
	default:
		// coding logic error if Handle() returns anything else
		EXCEPT("ERROR SharedPortState::Handle() unexpected return code %d",result);
		break;
	}

	return result;

#else

#error HAVE_SHARED_PORT is defined, but no method for passing fds is enabled.

#endif
}
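
On the HAVE_SCM_RIGHTS_PASSFD branch, the real work hidden inside SharedPortState::Handle() is a sendmsg() that carries the socket as SCM_RIGHTS ancillary data over a Unix domain socket. A minimal sketch of that mechanism (error handling trimmed; this is not the actual SharedPortState code):

#include <cstring>
#include <sys/socket.h>
#include <sys/uio.h>

// Send file descriptor fd_to_pass over the connected Unix domain socket
// conn_fd using SCM_RIGHTS ancillary data.  Returns 0 on success.
int pass_fd( int conn_fd, int fd_to_pass )
{
	char dummy = 'F';	// must send at least one byte of real data
	struct iovec iov = { &dummy, 1 };

	char cbuf[CMSG_SPACE(sizeof(int))];
	std::memset( cbuf, 0, sizeof(cbuf) );

	struct msghdr msg;
	std::memset( &msg, 0, sizeof(msg) );
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cbuf;
	msg.msg_controllen = sizeof(cbuf);

	struct cmsghdr *cmsg = CMSG_FIRSTHDR( &msg );
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;	// we are passing descriptors
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	std::memcpy( CMSG_DATA(cmsg), &fd_to_pass, sizeof(int) );

	return sendmsg( conn_fd, &msg, 0 ) == 1 ? 0 : -1;
}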
Example #30
/**
 * A job is exiting the Starter and we need to take necessary
 * actions. First we will update the job's ad file with various
 * information about what the job did. Next, if the job completed on
 * its own, we'll want to call the StarterUserPolicy's checkAtExit(),
 * which handles setting the right exit status to control the job's
 * final state in the job queue. If the job is being killed from "unnatural"
 * causes, such as a condor_rm, then we will figure out what the right
 * update type is for the job and write an EVICT event to the user log.
 * 
 * @param exit_status - the exit status of the job from wait();
 * currently unused
 * @param reason - the Condor-defined reason why the job is exiting
 * @param user_proc - the Proc object for this job
 * @return true if the job was set to exit properly
 * @see h/exit.h
 **/
bool
JICLocalSchedd::notifyJobExit( int, int reason, 
							   UserProc* user_proc )
{
		// Remember what steps we've completed, in case we need to retry.
	static bool did_final_ad_publish = false;
	static bool did_schedd_update = false;
	static bool did_check_at_exit = false;
	static bool did_ulog_event = false;

	m_tried_notify_job_exit = true;
 
	if (!did_final_ad_publish) {
			// Prepare to update the job queue.  In this case, we want
			// to publish all the same attribute we'd otherwise send
			// to the shadow, but instead, just stick them directly
			// into our copy of the job classad.
		Starter->publishPreScriptUpdateAd( job_ad );
		if( user_proc ) {
			user_proc->PublishUpdateAd( job_ad );
		}
		Starter->publishPostScriptUpdateAd( job_ad );
		did_final_ad_publish = true;
	}
	
		// Only check to see what we should do with our job 
		// in the user policy object if the job terminated
		// on its own.  Otherwise, we've already been there
		// and done that.
	if ( reason == JOB_EXITED || reason == JOB_COREDUMPED ) {
		if( !did_check_at_exit ) {
				// What should be the return value for this?
				// Can I just assume that things went well?
			this->starter_user_policy->checkAtExit( );
			did_check_at_exit = true;
		}
	}
	else if( reason == JOB_MISSED_DEFERRAL_TIME ) {
			//
			// This is supposed to be temporary until we have some kind
			// of error handling in place for jobs that never started.
			// Andy Pavlo - 01.24.2006 - [email protected]
			//
		exit_code = JOB_MISSED_DEFERRAL_TIME;
	}

	if( !did_ulog_event ) {
			// Use the final exit code to determine what event to log.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluated by checkAtExit() may
			// have changed things.
		switch( this->exit_code ) {
		case JOB_EXITED:
			this->u_log->logTerminate( this->job_ad );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REQUEUE:
			// Following the baseshadow, if the job is being requeued
			// then it is an eviction event
			this->u_log->logRequeueEvent( this->job_ad, false );
			did_ulog_event = true;
			break;
		case JOB_SHOULD_REMOVE:
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			// NOTE: The local universe's log actions are not consistent
			// with what the Shadow does. This is because the Shadow is
			// not consistent with itself; for example, a condor_rm
			// will cause an EVICT notice in the user log, but a 
			// periodic remove will not. This is something Derek
			// said he will clean up later on. For now, however, we are
			// going to be consistent with ourself in the local universe
			// and ALWAYS send an eviction notice when the job is 
			// removed
			this->u_log->logEvict( this->job_ad, false );
			did_ulog_event = true;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}
	}


	if( !did_schedd_update ) {
			// Use the final exit code to determine the update type.
			// This may be different from what is indicated by 'reason',
			// because a policy expression evaluated by checkAtExit() may
			// have changed things.
		update_t up_type = U_TERMINATE;
		switch( this->exit_code ) {
		case JOB_EXITED:
			up_type = U_TERMINATE;
			break;
		case JOB_SHOULD_REQUEUE:
			up_type = U_REQUEUE;
			break;
		case JOB_SHOULD_REMOVE:
			up_type = U_REMOVE;
			break;
		case JOB_SHOULD_HOLD:
		case JOB_MISSED_DEFERRAL_TIME:
			up_type = U_HOLD;
			break;
		default:
			EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
		}

			// Now that we've logged the event, we can update the job queue
			// If we're doing a fast shutdown, don't retry on failure.
		if ( !this->job_updater->updateJob( up_type ) && !fast_exit ) {
			dprintf( D_ALWAYS,
			         "Failed to update job queue - attempting to retry.\n" );
			retryJobCleanup();
			return ( false );
		}

		did_schedd_update = true;
	}

		//
		// Once the job's been updated in the queue, we can also try
		// sending email notification, if desired.
		// This returns void, so there's no way to test for failure.
		// Therefore, we don't bother with retry.
		//
	Email msg;
	switch( this->exit_code ) {
	case JOB_SHOULD_REQUEUE:
	case JOB_EXITED:
		msg.sendExit( job_ad, reason );
		break;
	case JOB_SHOULD_REMOVE: {
		char *remove_reason = NULL;
		this->job_ad->LookupString( ATTR_REMOVE_REASON, &remove_reason );
		msg.sendRemove( this->job_ad, remove_reason ? remove_reason : "" );
		free( remove_reason );
		break;
	}
	case JOB_SHOULD_HOLD: {
		char *hold_reason = NULL;
		this->job_ad->LookupString( ATTR_HOLD_REASON, &hold_reason );
		msg.sendHold( this->job_ad, hold_reason ? hold_reason : "" );
		free( hold_reason );
		break;
	}
	case JOB_MISSED_DEFERRAL_TIME:
		msg.sendHold( this->job_ad, "missed deferral time" );
		break;
	default:
		EXCEPT("Internal error in JICLocalSchedd::notifyJobExit: unexpected exit code %d",this->exit_code);
	}

		//
		// Lastly, we will call to write out the file. This was 
		// originally done in JICLocal::notifyJobExit(), but we no
		// longer call that
		//
	this->writeOutputAdFile( this->job_ad );

		//
		// Once we get here, everything has been successfully
		// wrapped up.
		//
	return true;
}
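
notifyJobExit() maps the final exit code to a user-log event and to a schedd update type in two parallel switch statements; keeping those switches in sync is the fragile part. The same mapping written once as a table (the enum values and event names below are illustrative stand-ins, not Condor's definitions):

enum ExitCode { JOB_EXITED, JOB_SHOULD_REQUEUE, JOB_SHOULD_REMOVE,
                JOB_SHOULD_HOLD, JOB_MISSED_DEFERRAL_TIME };
enum update_t { U_TERMINATE, U_REQUEUE, U_REMOVE, U_HOLD };

struct ExitAction { ExitCode code; update_t update; const char *ulog_event; };

// One row per exit code, mirroring the two switch statements above.
static const ExitAction exit_actions[] = {
	{ JOB_EXITED,               U_TERMINATE, "terminate" },
	{ JOB_SHOULD_REQUEUE,       U_REQUEUE,   "evict (requeue)" },
	{ JOB_SHOULD_REMOVE,        U_REMOVE,    "evict" },
	{ JOB_SHOULD_HOLD,          U_HOLD,      "evict" },
	{ JOB_MISSED_DEFERRAL_TIME, U_HOLD,      "evict" },
};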