Esempio n. 1
0
bool JobServerObject::getSummary(const char* key, JobSummaryFields& _summary, AviaryStatus &_status) {
	Job* job = NULL;
	if (!(job = getValidKnownJob(key,_status))) {
		return false;
	}

    ClassAd classAd;
    job->getSummary ( classAd );
    // little cheat for ad problems with history lookups
    string str;
    if ( classAd.LookupString("JOB_AD_ERROR", str) )
    {
		aviUtilFmt(_status.text,"Error obtaining ClassAd for job '%s'; ",key);
		_status.text += str;
		dprintf(D_ALWAYS,"%s\n",_status.text.c_str());
        return false;
    }

	// return the limited attributes
    classAd.LookupString(ATTR_JOB_CMD,_summary.cmd);
	classAd.LookupString(ATTR_JOB_ARGUMENTS1,_summary.args1);
	classAd.LookupString(ATTR_JOB_ARGUMENTS2,_summary.args2);
	classAd.LookupString(ATTR_HOLD_REASON,_summary.hold_reason);
	classAd.LookupString(ATTR_RELEASE_REASON,_summary.release_reason);
	classAd.LookupString(ATTR_REMOVE_REASON,_summary.remove_reason);
	classAd.LookupString(ATTR_JOB_SUBMISSION,_summary.submission_id);
	classAd.LookupString(ATTR_OWNER,_summary.owner);
	classAd.LookupInteger(ATTR_Q_DATE,_summary.queued);
	classAd.LookupInteger(ATTR_ENTERED_CURRENT_STATUS,_summary.last_update);
	_summary.status = job->getStatus();

	_status.type = AviaryStatus::A_OK;
    return true;
}
Esempio n. 2
0
// For now, just return true if the constraint worked on at least
// one job, false if not.  Someday, we can fix up the tool to take
// advantage of all the slick info the schedd gives us back about this
// request.  
bool
doWorkByConstraint( const char* constraint, CondorError * errstack )
{
	ClassAd* ad = 0;
	int total_jobs = -1;
	bool rval = true;
	switch( mode ) {
	case JA_RELEASE_JOBS:
		ad = schedd->releaseJobs( constraint, actionReason, errstack );
		break;
	case JA_REMOVE_X_JOBS:
		ad = schedd->removeXJobs( constraint, actionReason, errstack );
		break;
	case JA_VACATE_JOBS:
		ad = schedd->vacateJobs( constraint, VACATE_GRACEFUL, errstack );
		break;
	case JA_VACATE_FAST_JOBS:
		ad = schedd->vacateJobs( constraint, VACATE_FAST, errstack );
		break;
	case JA_REMOVE_JOBS:
		ad = schedd->removeJobs( constraint, actionReason, errstack );
		break;
	case JA_HOLD_JOBS:
		ad = schedd->holdJobs( constraint, actionReason, holdReasonSubCode, errstack );
		break;
	case JA_SUSPEND_JOBS:
		ad = schedd->suspendJobs( constraint, actionReason, errstack );
		break;
	case JA_CONTINUE_JOBS:
		ad = schedd->continueJobs( constraint, actionReason, errstack );
		break;
	default:
		EXCEPT( "impossible: unknown mode in doWorkByConstraint" );
	}
	if( ! ad ) {
		had_error = true;
		rval = false;
	} else {
		int result = FALSE;
		if( !ad->LookupInteger(ATTR_ACTION_RESULT, result) ) {
			had_error = true;
			rval = false;
		}
		else if( !result ) {
			// There were no ads acted upon, but that doesn't
			// mean there was an error.  It's possible the schedd
			// had no jobs
		        if( !ad->LookupInteger(ATTR_TOTAL_JOB_ADS, total_jobs) || total_jobs > 0 ) {
				had_error = true;
			} else {
				// There were no jobs in the queue, so add a
				// more meaningful error message
				errstack->push("condor_rm", 0, "There are no jobs in the queue");
			}
			rval = false;
		}
	}
	return rval;
}
Esempio n. 3
0
bool
VMUniverseMgr::allocVM(pid_t s_pid, ClassAd &ad, char const *execute_dir)
{
	if( canCreateVM(&ad) == false ) {
		return false;
	}

	// Find memory for VM
	int vm_mem = 0;
	if( (ad.LookupInteger(ATTR_JOB_VM_MEMORY, vm_mem) != 1) &&
	    (ad.LookupInteger(ATTR_REQUEST_MEMORY, vm_mem) != 1) ) {
		dprintf(D_ALWAYS, "Can't find VM memory in Job ClassAd\n");
		return false;
	}

	int vcpus = 0;
	if( (ad.LookupInteger(ATTR_JOB_VM_VCPUS, vcpus) != 1) &&
	    (ad.LookupInteger(ATTR_REQUEST_CPUS, vcpus) != 1) )
	  {
	    dprintf(D_FULLDEBUG, "Defaulting to one CPU\n");
	    vcpus = 1;
	  }

	// check whether this pid already exists
	VMStarterInfo *oldinfo = findVMStarterInfoWithStarterPid(s_pid);
	if( oldinfo ) {
		freeVM(s_pid);
		// oldinfo is freed 
		oldinfo = NULL;
	}
	
	VMStarterInfo *newinfo = new VMStarterInfo;
	ASSERT(newinfo);

	m_vm_used_memory += vm_mem;

	newinfo->m_pid = s_pid;
	newinfo->m_memory = vm_mem;
	newinfo->m_job_ad = ad; 
	newinfo->m_execute_dir = execute_dir;
	newinfo->m_vcpus = vcpus;

	// If there exists MAC or IP address for a checkpointed VM,
	// we use them as initial values.
	MyString string_value;
	if( ad.LookupString(ATTR_VM_CKPT_MAC, string_value) == 1 ) {
		newinfo->m_vm_mac = string_value;
	}
	/*
	string_value = "";
	if( ad.LookupString(ATTR_VM_CKPT_IP, string_value) == 1 ) {
		newinfo->m_vm_ip = string_value;
	}
	*/

	m_vm_starter_list.Append(newinfo);
	return true;
}
Esempio n. 4
0
void CollectorEngine::
cleanHashTable (CollectorHashTable &hashTable, time_t now, HashFunc makeKey)
{
	ClassAd  *ad;
	int   	 timeStamp;
	int		 max_lifetime;
	AdNameHashKey  hk;
	double   timeDiff;
	MyString	hkString;

	hashTable.startIterations ();
	while (hashTable.iterate (ad))
	{
		// Read the timestamp of the ad
		if (!ad->LookupInteger (ATTR_LAST_HEARD_FROM, timeStamp)) {
			dprintf (D_ALWAYS, "\t\tError looking up time stamp on ad\n");
			continue;
		}

		// how long has it been since the last update?
		timeDiff = difftime( now, timeStamp );

		if( !ad->LookupInteger( ATTR_CLASSAD_LIFETIME, max_lifetime ) ) {
			max_lifetime = machineUpdateInterval;
		}

		// check if it has expired
		if ( timeDiff > (double) max_lifetime )
		{
			// then remove it from the segregated table
			(*makeKey) (hk, ad);
			hk.sprint( hkString );
			if( timeStamp == 0 ) {
				dprintf (D_ALWAYS,"\t\t**** Removing invalidated ad: \"%s\"\n", hkString.Value() );
			}
			else {
				dprintf (D_ALWAYS,"\t\t**** Removing stale ad: \"%s\"\n", hkString.Value() );
			    /* let the off-line plug-in know we are about to expire this ad, so it can
				   potentially mark the ad absent. if expire() returns false, then delete
				   the ad as planned; if it return true, it was likely marked as absent,
				   so then this ad should NOT be deleted. */
				if ( CollectorDaemon::offline_plugin_.expire( *ad ) == true ) {
					// plugin say to not delete this ad, so continue
					continue;
				} else {
					dprintf (D_ALWAYS,"\t\t**** Removing stale ad: \"%s\"\n", hkString.Value() );
				}
			}
			if (hashTable.remove (hk) == -1)
			{
				dprintf (D_ALWAYS, "\t\tError while removing ad\n");
			}
			delete ad;
		}
	}
}
Esempio n. 5
0
bool
Defrag::drain(const ClassAd &startd_ad)
{
	std::string name;
	startd_ad.LookupString(ATTR_NAME,name);

	dprintf(D_ALWAYS,"Initiating %s draining of %s.\n",
			m_draining_schedule_str.c_str(),name.c_str());

	DCStartd startd( &startd_ad );

	int graceful_completion = 0;
	startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,graceful_completion);
	int quick_completion = 0;
	startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,quick_completion);
	int graceful_badput = 0;
	startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,graceful_badput);
	int quick_badput = 0;
	startd_ad.LookupInteger(ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,quick_badput);

	time_t now = time(NULL);
	std::string draining_check_expr;
	double badput_growth_tolerance = 1.25; // for now, this is hard-coded
	int negligible_badput = 1200;
	int negligible_deadline_slippage = 1200;
	if( m_draining_schedule <= DRAIN_GRACEFUL ) {
		dprintf(D_ALWAYS,"Expected draining completion time is %ds; expected draining badput is %d cpu-seconds\n",
				(int)(graceful_completion-now),graceful_badput);
		sprintf(draining_check_expr,"%s <= %d && %s <= %d",
				ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_COMPLETION,
				graceful_completion + negligible_deadline_slippage,
				ATTR_EXPECTED_MACHINE_GRACEFUL_DRAINING_BADPUT,
				(int)(badput_growth_tolerance*graceful_badput) + negligible_badput);
	}
	else { // DRAIN_FAST and DRAIN_QUICK are effectively equivalent here
		dprintf(D_ALWAYS,"Expected draining completion time is %ds; expected draining badput is %d cpu-seconds\n",
				(int)(quick_completion-now),quick_badput);
		sprintf(draining_check_expr,"%s <= %d && %s <= %d",
				ATTR_EXPECTED_MACHINE_QUICK_DRAINING_COMPLETION,
				quick_completion + negligible_deadline_slippage,
				ATTR_EXPECTED_MACHINE_QUICK_DRAINING_BADPUT,
				(int)(badput_growth_tolerance*quick_badput) + negligible_badput);
	}

	std::string request_id;
	bool resume_on_completion = true;
	bool rval = startd.drainJobs( m_draining_schedule, resume_on_completion, draining_check_expr.c_str(), request_id );
	if( !rval ) {
		dprintf(D_ALWAYS,"Failed to send request to drain %s: %s\n",startd.name(),startd.error());
		m_stats.DrainFailures += 1;
		return false;
	}
	m_stats.DrainSuccesses += 1;

	return true;
}
Esempio n. 6
0
void Defrag::loadState()
{
	FILE *fp;
	if( !(fp = safe_fopen_wrapper_follow(m_state_file.c_str(), "r")) ) {
		if( errno == ENOENT ) {
			dprintf(D_ALWAYS,"State file %s does not yet exist.\n",m_state_file.c_str());
		}
		else {
			EXCEPT("failed to load state from %s\n",m_state_file.c_str());
		}
	}
	else {
		int isEOF=0, errorReadingAd=0, adEmpty=0;
		ClassAd *ad = new ClassAd(fp, "...", isEOF, errorReadingAd, adEmpty);
		fclose( fp );

		if( errorReadingAd ) {
			dprintf(D_ALWAYS,"WARNING: failed to parse state from %s\n",m_state_file.c_str());
		}

		int timestamp = (int)m_last_poll;
		ad->LookupInteger(ATTR_LAST_POLL,timestamp);
		m_last_poll = (time_t)timestamp;

		dprintf(D_ALWAYS,"Last poll: %d\n",(int)m_last_poll);

		delete ad;
	}
}
Esempio n. 7
0
ULogEventOutcome
ReadUserLog::readEventXML( ULogEvent *& event )
{
	classad::ClassAdXMLParser xmlp;

	// we obtain a write lock here not because we want to write
	// anything, but because we want to ensure we don't read
	// mid-way through someone else's write
	Lock( true );

	// store file position so that if we are unable to read the event, we can
	// rewind to this location
  	long     filepos;
  	if (!m_fp || ((filepos = ftell(m_fp)) == -1L))
  	{
  		Unlock( true );
		event = NULL;
  		return ULOG_UNK_ERROR;
  	}

	ClassAd* eventad = new ClassAd();
	if ( !xmlp.ParseClassAd(m_fp, *eventad) ) {
		delete eventad;
		eventad = NULL;
	}

	Unlock( true );

	if( !eventad ) {
		// we don't have the full event in the stream yet; restore file
		// position and return
		if( fseek(m_fp, filepos, SEEK_SET) )	{
			dprintf(D_ALWAYS, "fseek() failed in ReadUserLog::readEvent");
			return ULOG_UNK_ERROR;
		}
		clearerr(m_fp);
		event = NULL;
		return ULOG_NO_EVENT;
	}

	int enmbr;
	if( !eventad->LookupInteger("EventTypeNumber", enmbr) ) {
		event = NULL;
		delete eventad;
		return ULOG_NO_EVENT;
	}

	if( !(event = instantiateEvent((ULogEventNumber) enmbr)) ) {
		event = NULL;
		delete eventad;
		return ULOG_UNK_ERROR;
	}

	event->initFromClassAd(eventad);

	delete eventad;
	return ULOG_OK;
}
Esempio n. 8
0
void test_user_policy_on_exit_remove_yes(void)
{
	int val;
	int action;
	char buf[4096];
	ClassAd *result;
	ClassAd *jad = new ClassAd;
	if (jad == NULL)
	{
		printf("Out of memory!\n");
		exit(EXIT_FAILURE);
	}
	
	printf("==========================================\n");
	printf("Testing User Policy on On Exit Remove: YES\n");

	/* set up the classad */
	sprintf(buf, "%s = %d", ATTR_ON_EXIT_CODE, 0);
	jad->Insert(buf);
	sprintf(buf, "%s = 42", ATTR_TOTAL_SUSPENSIONS);
	jad->Insert(buf);
	sprintf(buf, "%s = FALSE", ATTR_PERIODIC_HOLD_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = FALSE", ATTR_PERIODIC_REMOVE_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = FALSE", ATTR_ON_EXIT_HOLD_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = TotalSuspensions == 42", ATTR_ON_EXIT_REMOVE_CHECK);
	jad->Insert(buf);

	result = user_job_policy(jad);

	result->EvalBool(ATTR_USER_POLICY_ERROR, result, val);
	if(val == true)
	{
		printf("An error happened\n");
		delete result;
		return;
	}

	result->EvalBool(ATTR_TAKE_ACTION, result, val);
	if (val == true)
	{
		printf("%s was true.\n", ATTR_TAKE_ACTION);
		result->LookupInteger(ATTR_USER_POLICY_ACTION, action);
		printf("Action is: %s\n", 
			action==REMOVE_JOB?"REMOVE_JOB":action==HOLD_JOB?"HOLD_JOB":
			"UNKNOWN");
		result->LookupString(ATTR_USER_POLICY_FIRING_EXPR, buf);
		printf("Reason for action: %s\n", buf);
	}
	else
	{
		printf("Something went wrong. I should have had an action to take.\n");
	}
}
Esempio n. 9
0
int TransferQueueManager::HandleRequest(int cmd,Stream *stream)
{
	ReliSock *sock = (ReliSock *)stream;
	ASSERT( cmd == TRANSFER_QUEUE_REQUEST );

	ClassAd msg;
	sock->decode();
	if( !getClassAd( sock, msg ) || !sock->end_of_message() ) {
		dprintf(D_ALWAYS,
				"TransferQueueManager: failed to receive transfer request "
				"from %s.\n", sock->peer_description() );
		return FALSE;
	}

	bool downloading = false;
	MyString fname;
	MyString jobid;
	MyString queue_user;
	filesize_t sandbox_size;
	if( !msg.LookupBool(ATTR_DOWNLOADING,downloading) ||
		!msg.LookupString(ATTR_FILE_NAME,fname) ||
		!msg.LookupString(ATTR_JOB_ID,jobid) ||
		!msg.LookupString(ATTR_USER,queue_user) ||
		!msg.LookupInteger(ATTR_SANDBOX_SIZE,sandbox_size))
	{
		MyString msg_str;
		sPrintAd(msg_str, msg);
		dprintf(D_ALWAYS,"TransferQueueManager: invalid request from %s: %s\n",
				sock->peer_description(), msg_str.Value());
		return FALSE;
	}

		// Currently, we just create the client with the default max queue
		// age.  If it becomes necessary to customize the maximum age
		// on a case-by-case basis, it should be easy to adjust.

	TransferQueueRequest *client =
		new TransferQueueRequest(
			sock,
			sandbox_size,
			fname.Value(),
			jobid.Value(),
			queue_user.Value(),
			downloading,
			m_default_max_queue_age);

	if( !AddRequest( client ) ) {
		delete client;
		return KEEP_STREAM; // we have already closed this socket
	}

	return KEEP_STREAM;
}
Esempio n. 10
0
void
AviaryScheddPlugin::initialize()
{
		// Since this plugin is registered with multiple
		// PluginManagers it may be initialized more than once,
		// and we don't want that
	static bool skip = false;
	if (skip) return; skip = true;

		// WalkJobQueue(int (*func)(ClassAd *))
	ClassAd *ad = GetNextJob(1);
	while (ad != NULL) {
		string key;
		PROC_ID id;
		int value;

		// schedd should ignore these if missing
		if (!ad->LookupInteger(ATTR_CLUSTER_ID, id.cluster)) {
			dprintf(D_ALWAYS,"%s on job is missing or not an integer\n", ATTR_CLUSTER_ID);
			return;
		}
		if (!ad->LookupInteger(ATTR_PROC_ID, id.proc)) {
			dprintf(D_ALWAYS,"%s on job is missing or not an integer\n", ATTR_PROC_ID);
			return;
		}
		if (!ad->LookupInteger(ATTR_JOB_STATUS, value)) {
			dprintf(D_ALWAYS,"%s on job is missing or not an integer\n", ATTR_JOB_STATUS);
			return;
		}

		aviUtilFmt(key,"%d.%d", id.cluster, id.proc);

		processJob(key.c_str(), ATTR_JOB_STATUS, value);

		FreeJobAd(ad);
		ad = GetNextJob(0);
	}
 
	m_initialized = true;
}
Esempio n. 11
0
bool
CCBListener::SendMsgToCCB(ClassAd &msg,bool blocking)
{
	if( !m_sock ) {
		Daemon ccb(DT_COLLECTOR,m_ccb_address.Value());

		int cmd = -1;
		msg.LookupInteger( ATTR_COMMAND, cmd );
		if( cmd != CCB_REGISTER ) {
			dprintf(D_ALWAYS,"CCBListener: no connection to CCB server %s"
					" when trying to send command %d\n",
					m_ccb_address.Value(), cmd );
			return false;
		}

		// Specifying USE_TMP_SEC_SESSION to force a fresh security
		// session.  Otherwise we can end up in a catch-22 where we
		// are trying to reconnect to the CCB server and we try to use
		// a cached security session which is no longer valid, but our
		// CCB server cannot send us the invalidation message because
		// we are trying to reconnect to it.  Expring this session
		// right away is also a good idea, because if we are just
		// starting up, the return address associated with it will not
		// have any CCB information attached, which again means that
		// the CCB server has no way to invalidate it.

		if( blocking ) {
			m_sock = ccb.startCommand( cmd, Stream::reli_sock, CCB_TIMEOUT, NULL, NULL, false, USE_TMP_SEC_SESSION );
			if( m_sock ) {
				Connected();
			}
			else {
				Disconnected();
				return false;
			}
		}
		else if( !m_waiting_for_connect ) {
			m_sock = ccb.makeConnectedSocket(Stream::reli_sock, CCB_TIMEOUT, 0, NULL, true /*nonblocking*/ );
			if( !m_sock ) {
				Disconnected();
				return false;
			}
			m_waiting_for_connect = true;
			incRefCount(); // do not let ourselves be deleted until called back
			ccb.startCommand_nonblocking( cmd, m_sock, CCB_TIMEOUT, NULL, CCBListener::CCBConnectCallback, this, NULL, false, USE_TMP_SEC_SESSION );
			return false;
		}
	}

	return WriteMsgToCCB(msg);
}
Esempio n. 12
0
/*
 * This will be called when the event fd fires, indicating an OOM event.
 */
int
VanillaProc::outOfMemoryEvent(int /* fd */)
{

	/* The cgroup API generates this notification whenever the OOM fires OR
	 * the cgroup is removed. If the cgroups are not pre-created, the kernel will
	 * remove the cgroup when the job completes. So if we land here and there are
	 * no more job pids, we assume the cgroup was removed and just ignore the event.
	 * However, if we land here and we still have job pids, we assume the OOM fired
	 * and thus we place the job on hold. See gt#3824.
	 */

	// If we have no jobs left, prolly just cgroup removed, so do nothing and return
	if (num_pids == 0) {
		dprintf(D_FULLDEBUG, "Closing event FD pipe %d.\n", m_oom_efd);
		cleanupOOM();
		return 0;
	}

	// The OOM killer has fired, and the process is frozen.  However,
	// the cgroup still has accurate memory usage.  Let's grab that
	// and make a final update, so the user can see exactly how much
	// memory they used.
	ClassAd updateAd;
	PublishUpdateAd( &updateAd );
	Starter->jic->periodicJobUpdate( &updateAd, true );
	int usage;
	updateAd.LookupInteger(ATTR_MEMORY_USAGE, usage);

	std::stringstream ss;
	if (m_memory_limit >= 0) {
		ss << "Job has gone over memory limit of " << m_memory_limit << " megabytes. Peak usage: " << usage << " megabytes.";
	} else {
		ss << "Job has encountered an out-of-memory event.";
	}
	if( isCheckpointing ) {
		ss << "  This occurred while the job was checkpointing.";
	}

	dprintf( D_ALWAYS, "Job was held due to OOM event: %s\n", ss.str().c_str());

	dprintf(D_FULLDEBUG, "Closing event FD pipe %d.\n", m_oom_efd);
	cleanupOOM();

	// This ulogs the hold event and KILLS the shadow
	Starter->jic->holdJob(ss.str().c_str(), CONDOR_HOLD_CODE_JobOutOfResources, 0);

	return 0;
}
Esempio n. 13
0
float
Starter::percentCpuUsage( void )
{
	if (daemonCore->Get_Family_Usage(s_pid, s_usage, true) == FALSE) {
		EXCEPT( "Starter::percentCpuUsage(): Fatal error getting process "
		        "info for the starter and decendents" ); 
	}

	// In vm universe, there may be a process dealing with a VM.
	// Unfortunately, such a process is created by a specific 
	// running daemon for virtual machine program(such as vmware-serverd).
	// So our Get_Family_Usage is unable to get usage for the process.
	// Here, we call a function in vm universe manager.
	// If this starter is for vm universe, s_usage will be updated.
	// Otherwise, s_usage will not change.
	if ( resmgr ) {
		resmgr->m_vmuniverse_mgr.getUsageForVM(s_pid, s_usage);
		
		// Now try to tack on details posted in the job ad because 
		// hypervisors such as libvirt will spawn processes outside 
		// of condor's perview. 
		float fPercentCPU=0.0;
		int iNumCPUs=0;
		ClassAd * jobAd = s_claim->ad();
		
		jobAd->LookupFloat(ATTR_JOB_VM_CPU_UTILIZATION, fPercentCPU);
		jobAd->LookupInteger(ATTR_JOB_VM_VCPUS, iNumCPUs);
		
		// computations outside take cores into account.
		fPercentCPU = fPercentCPU * iNumCPUs;
		
		dprintf( D_LOAD, "Starter::percentCpuUsage() adding VM Utilization %f\n",fPercentCPU);
		
		s_usage.percent_cpu += fPercentCPU;
		
	}

	if( IsDebugVerbose(D_LOAD) ) {
		dprintf(D_LOAD,
		        "Starter::percentCpuUsage(): Percent CPU usage "
		        "for the family of starter with pid %u is: %f\n",
		        s_pid,
		        s_usage.percent_cpu );
	}

	return s_usage.percent_cpu;
}
Esempio n. 14
0
void test_oldstyle_with_exit(void)
{
	int val;
	int action;
	char buf[4096];
	ClassAd *result;
	ClassAd *jad = new ClassAd;
	if (jad == NULL)
	{
		printf("Out of memory!\n");
		exit(EXIT_FAILURE);
	}
	
	printf("==================================================\n");
	printf("Testing OldStyle job where it is marked as exited.\n");

	/* An oldstyle classad would have this */
	sprintf(buf, "%s = %d", ATTR_COMPLETION_DATE, 10); /* non zero */
	jad->Insert(buf);

	result = user_job_policy(jad);

	result->EvalBool(ATTR_USER_POLICY_ERROR, result, val);
	if(val == true)
	{
		printf("An error happened\n");
		delete result;
		return;
	}

	result->EvalBool(ATTR_TAKE_ACTION, result, val);
	if (val == true)
	{
		printf("%s was true.\n", ATTR_TAKE_ACTION);
		result->LookupInteger(ATTR_USER_POLICY_ACTION, action);
		printf("Action is: %s\n", 
			action==REMOVE_JOB?"REMOVE_JOB":action==HOLD_JOB?"HOLD_JOB":
			"UNKNOWN");
		result->LookupString(ATTR_USER_POLICY_FIRING_EXPR, buf);
		printf("Reason for action: %s\n", buf);
	}
	else
	{
		printf("Something went wrong. I should have had an action to take.\n");
	}
}
Esempio n. 15
0
ClassAd*
doWorkByList( StringList* ids, CondorError *errstack )
{
	ClassAd* rval = 0;
	switch( mode ) {
	case JA_RELEASE_JOBS:
		rval = schedd->releaseJobs( ids, actionReason, errstack );
		break;
	case JA_REMOVE_X_JOBS:
		rval = schedd->removeXJobs( ids, actionReason, errstack );
		break;
	case JA_VACATE_JOBS:
		rval = schedd->vacateJobs( ids, VACATE_GRACEFUL, errstack );
		break;
	case JA_VACATE_FAST_JOBS:
		rval = schedd->vacateJobs( ids, VACATE_FAST, errstack );
		break;
	case JA_REMOVE_JOBS:
		rval = schedd->removeJobs( ids, actionReason, errstack );
		break;
	case JA_HOLD_JOBS:
		rval = schedd->holdJobs( ids, actionReason, holdReasonSubCode, errstack );
		break;
	case JA_SUSPEND_JOBS:
		rval = schedd->suspendJobs( ids, actionReason, errstack );
		break;
	case JA_CONTINUE_JOBS:
		rval = schedd->continueJobs( ids, actionReason, errstack );
		break;
	default:
		EXCEPT( "impossible: unknown mode in doWorkByList" );
	}
	if( ! rval ) {
		had_error = true;
	} else {
		int result = FALSE;
		if( !rval->LookupInteger(ATTR_ACTION_RESULT, result) || !result ) {
			had_error = true;
		}
	}
	return rval;
}
Esempio n. 16
0
bool
CCBListener::ReadMsgFromCCB()
{
	if( !m_sock ) {
		return false;
	}
	m_sock->timeout(CCB_TIMEOUT);
	ClassAd msg;
	if( !msg.initFromStream( *m_sock ) || !m_sock->end_of_message() ) {
		dprintf(D_ALWAYS,
				"CCBListener: failed to receive message from CCB server %s\n",
				m_ccb_address.Value());
		Disconnected();
		return false;
	}

	m_last_contact_from_peer = time(NULL);
	RescheduleHeartbeat();

	int cmd = -1;
	msg.LookupInteger( ATTR_COMMAND, cmd );
	switch( cmd ) {
	case CCB_REGISTER:
		return HandleCCBRegistrationReply( msg );
	case CCB_REQUEST:
		return HandleCCBRequest( msg );
	case ALIVE:
		dprintf(D_FULLDEBUG,"CCBListener: received heartbeat from server.\n");
		return true;
	}

	MyString msg_str;
	msg.sPrint(msg_str);
	dprintf( D_ALWAYS,
			 "CCBListener: Unexpected message received from CCB "
			 "server: %s\n",
			 msg_str.Value() );
	return false;
}
Esempio n. 17
0
/* After the job exits, look into the classad to see if certain things are
	true or not */
void static_policy(void)
{
	ClassAd *result;
	int val;
	int action;
	char buf[4096];
	char buf2[4096];

	/* See what the user job policy has in store for me. */
	result = user_job_policy(JobAd);
	
	result->EvalBool(ATTR_USER_POLICY_ERROR, result, val);
	if (val == 1)
	{
		dprintf(D_ALWAYS, "There was an error in the static policy\n");
		delete result;
		return;
	}

	result->EvalBool(ATTR_TAKE_ACTION, result, val);
	if (val == 1)
	{
		result->LookupString(ATTR_USER_POLICY_FIRING_EXPR, buf, sizeof(buf));

		result->LookupInteger(ATTR_USER_POLICY_ACTION, action);
		switch(action)
		{
			case REMOVE_JOB:
				dprintf(D_ALWAYS, "Static Policy: removing job because %s has "
					"become true\n", buf);

				/* do nothing, the nasty old shadow logic takes it from here. */

				delete result;
				return;
				break;

			case HOLD_JOB:
				dprintf(D_ALWAYS, "Static Policy: holding job because %s has "
					"become true\n", buf);

				delete result;

				sprintf(buf, "Your job has been held because %s has become "
					"true\n", ATTR_PERIODIC_HOLD_CHECK);
				sprintf(buf2, "Your job has been held because %s has become "
					"true", ATTR_PERIODIC_HOLD_CHECK);

				/* This exits */
				HoldJob(buf, buf2, CONDOR_HOLD_CODE_JobPolicy, 0);
				return;
				break;

			default:
				dprintf(D_ALWAYS, "WARNING! Ignoring unknown action type in "
						"periodic_policy()\n");
				delete result;
				return;
				break;
		}
	}

	delete result;

	dprintf( D_ALWAYS, "Static policy: don't remove on exit\n" );
	EXCEPT( "Job didn't exit under conditions specifed in %s",
		ATTR_ON_EXIT_REMOVE_CHECK );
}
Esempio n. 18
0
ClassAd*
readJobAd( void )
{
    ClassAd* ad = NULL;
    bool is_stdin = false;
    bool read_something = false;

    ASSERT( job_ad_file );

    if( job_ad_file[0] == '-' && job_ad_file[1] == '\0' ) {
        fp = stdin;
        is_stdin = true;
    } else {
        if (fp == NULL) {
            fp = safe_fopen_wrapper_follow( job_ad_file, "r" );
            if( ! fp ) {
                EXCEPT( "Failed to open ClassAd file (%s): %s (errno %d)",
                        job_ad_file, strerror(errno), errno );
            }
        }
    }

    dprintf( D_FULLDEBUG, "Reading job ClassAd from %s\n",
             is_stdin ? "STDIN" : job_ad_file );

    ad = new ClassAd;
    MyString line;
    while( line.readLine(fp) ) {
        read_something = true;
        line.chomp();
        if( line[0] == '#' ) {
            dprintf( D_JOB, "IGNORING COMMENT: %s\n", line.Value() );
            continue;
        }
        if( line == "***" ) {
            dprintf( D_JOB, "Saw ClassAd delimitor, stopping\n" );
            break;
        }
        if( ! ad->Insert(line.Value()) ) {
            EXCEPT( "Failed to insert \"%s\" into ClassAd!", line.Value() );
        }
    }
    if( ! read_something ) {
        EXCEPT( "reading ClassAd from (%s): file is empty",
                is_stdin ? "STDIN" : job_ad_file );
    }
    if( IsDebugVerbose(D_JOB) ) {
        ad->dPrint( D_JOB );
    }

    // For debugging, see if there's a special attribute in the
    // job ad that sends us into an infinite loop, waiting for
    // someone to attach with a debugger
    int shadow_should_wait = 0;
    ad->LookupInteger( ATTR_SHADOW_WAIT_FOR_DEBUG,
                       shadow_should_wait );
    if( shadow_should_wait ) {
        dprintf( D_ALWAYS, "Job requested shadow should wait for "
                 "debugger with %s=%d, going into infinite loop\n",
                 ATTR_SHADOW_WAIT_FOR_DEBUG, shadow_should_wait );
        while( shadow_should_wait ) { }
    }

    return ad;
}
Esempio n. 19
0
void
CCBServer::HandleRequestResultsMsg( CCBTarget *target )
{
		// Reply from target daemon about whether it succeeded in
		// connecting to the requested client.

	Sock *sock = target->getSock();

	ClassAd msg;
	sock->decode();
	if( !msg.initFromStream( *sock ) || !sock->end_of_message() ) {
			// disconnect
		dprintf(D_FULLDEBUG,
				"CCB: received disconnect from target daemon %s "
				"with ccbid %lu.\n",
				sock->peer_description(), target->getCCBID() );
		RemoveTarget( target );
		return;
	}

	int command = 0;
	if( msg.LookupInteger( ATTR_COMMAND, command ) && command == ALIVE ) {
		SendHeartbeatResponse( target );
		return;
	}

	target->decPendingRequestResults();

	bool success = false;
	MyString error_msg;
	MyString reqid_str;
	CCBID reqid;
	MyString connect_id;
	msg.LookupBool( ATTR_RESULT, success );
	msg.LookupString( ATTR_ERROR_STRING, error_msg );
	msg.LookupString( ATTR_REQUEST_ID, reqid_str );
	msg.LookupString( ATTR_CLAIM_ID, connect_id );

	if( !CCBIDFromString( reqid, reqid_str.Value() ) ) {
		MyString msg_str;
		msg.sPrint(msg_str);
		dprintf(D_ALWAYS,
				"CCB: received reply from target daemon %s with ccbid %lu "
				"without a valid request id: %s\n",
				sock->peer_description(),
				target->getCCBID(),
				msg_str.Value());
		RemoveTarget( target );
		return;
	}

	CCBServerRequest *request = GetRequest( reqid );
	if( request && request->getSock()->readReady() ) {
		// Request socket must have just closed.  To avoid noise in
		// logs when we fail to write to it, delete the request now.
		RemoveRequest( request );
		request = NULL;
	}

	char const *request_desc = "(client which has gone away)";
	if( request ) {
		request_desc = request->getSock()->peer_description();
	}

	if( success ) {
		dprintf(D_FULLDEBUG,"CCB: received 'success' from target daemon %s "
				"with ccbid %lu for "
				"request %s from %s.\n",
				sock->peer_description(),
				target->getCCBID(),
				reqid_str.Value(),
				request_desc);
	}
	else {
		dprintf(D_FULLDEBUG,"CCB: received error from target daemon %s "
				"with ccbid %lu for "
				"request %s from %s: %s\n",
				sock->peer_description(),
				target->getCCBID(),
				reqid_str.Value(),
				request_desc,
				error_msg.Value());
	}

	if( !request ) {
		if( success ) {
				// expected: the client has gone away; it got what it wanted
			return;
		}
		dprintf( D_FULLDEBUG,
				 "CCB: client for request %s to target daemon %s with ccbid "
				 "%lu disappeared before receiving error details.\n",
				 reqid_str.Value(),
				 sock->peer_description(),
				 target->getCCBID());
		return;
	}
	if( connect_id != request->getConnectID() ) {
		MyString msg_str;
		msg.sPrint(msg_str);
		dprintf( D_FULLDEBUG,
				 "CCB: received wrong connect id (%s) from target daemon %s "
				 "with ccbid %lu for "
				 "request %s\n",
				 connect_id.Value(),
				 sock->peer_description(),
				 target->getCCBID(),
				 reqid_str.Value());
		RemoveTarget( target );
		return;
	}

	RequestFinished( request, success, error_msg.Value() );
}
Esempio n. 20
0
// The function occurs in a seperate thread or process
int
TransferD::write_files_thread(void *targ, Stream *sock)
{	
	ThreadArg *thread_arg = (ThreadArg*)targ;
	ReliSock *rsock = (ReliSock*)sock;
	TransferRequest *treq = NULL;
	// int protocol;
	SimpleList<ClassAd*> *jad_list = NULL;
	ClassAd *jad = NULL;
	int cluster, proc;
	int old_timeout;
	int result;
	ClassAd respad;

	// XXX This is a damn dirty hack whose solution resides in implementing
	// a checksum for the files.
	// Now we sleep here for one second.  Why?  So we are certain
	// to transfer back output files even if the job ran for less
	// than one second. This is because:
	// stat() can't tell the difference between:
	//   1) A job starts up, touches a file, and exits all in one second
	//   2) A job starts up, doesn't touch the file, and exits all in one
	//    second
	// So if we force the start time of the job to be one second later than
	// the time we know the files were written, stat() should be able
	// to perceive what happened, if anything.

	sleep(1);

	// even though I'm in a new process, I got here either through forking
	// or through a thread, so this memory is a copy.
	// protocol = thread_arg->protocol;
	treq = thread_arg->treq;
	delete thread_arg;

	// XXX deal with protocol value.

	////////////////////////////////////////////////////////////////////////
	// Sort the classads (XXX maybe put at a higher level in the protocol)
	////////////////////////////////////////////////////////////////////////
	
	// XXX TODO

	////////////////////////////////////////////////////////////////////////
	// Do the transfer.
	////////////////////////////////////////////////////////////////////////

	// file transfers can take a long time....
	old_timeout = rsock->timeout(60 * 60 * 8);

	jad_list = treq->todo_tasks();

	while(jad_list->Next(jad)) {
		FileTransfer ftrans;

		jad->LookupInteger(ATTR_CLUSTER_ID, cluster);
		jad->LookupInteger(ATTR_PROC_ID, proc);
		dprintf( D_ALWAYS, "TransferD::write_files_thread(): "
			"Transferring fileset for job %d.%d\n",
				cluster, proc);

		result = ftrans.SimpleInit(jad, true, true, rsock);
		if ( !result ) {
			dprintf( D_ALWAYS, "TransferD::write_files_thread(): "
				"failed to init file transfer for job %d.%d \n",
				cluster, proc );

			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"FileTransfer Object failed to SimpleInit.");
			respad.put(*rsock);
			rsock->end_of_message();

			rsock->timeout(old_timeout);

			return EXIT_FAILURE;
		}

		ftrans.setPeerVersion(treq->get_peer_version().Value());

		// We're "downloading" from the client to here.
		result = ftrans.DownloadFiles();
		if ( !result ) {

			dprintf( D_ALWAYS, "TransferD::write_files_thread(): "
				"failed to transfer files for job %d.%d \n",
				cluster, proc );

			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"FileTransfer Object failed to download.");
			respad.put(*rsock);
			rsock->end_of_message();

			rsock->timeout(old_timeout);
			return EXIT_FAILURE;
		}
	}

	rsock->end_of_message();

	//////////////////////////////////////////////////////////////////////////
	// Now that the file transfer is done, tell the client everything is ok.
	//////////////////////////////////////////////////////////////////////////

	dprintf(D_ALWAYS, "Informing client of finished transfer.\n");

	rsock->encode();

	respad.Assign(ATTR_TREQ_INVALID_REQUEST, FALSE);

	// This response ad to the client will contain:
	//
	//	ATTR_TREQ_INVALID_REQUEST (set to false)
	//
	respad.put(*rsock);
	rsock->end_of_message();

	delete rsock;

	return EXIT_SUCCESS;
}
Esempio n. 21
0
// This handler is called when a client wishes to write files from the
// transferd's storage.
int
TransferD::write_files_handler(int cmd, Stream *sock) 
{
	ReliSock *rsock = (ReliSock*)sock;
	MyString capability;
	int protocol = FTP_UNKNOWN;
	TransferRequest *treq = NULL;
	MyString fquser;
	static int transfer_reaper_id = -1;
	ThreadArg *thread_arg;
	int tid;
	ClassAd reqad;
	ClassAd respad;

	cmd = cmd; // quiet the compiler.

	dprintf(D_ALWAYS, "Got TRANSFERD_WRITE_FILES!\n");

	/////////////////////////////////////////////////////////////////////////
	// make sure we are authenticated
	/////////////////////////////////////////////////////////////////////////
	if( ! rsock->triedAuthentication() ) {
		CondorError errstack;
		if( ! SecMan::authenticate_sock(rsock, WRITE, &errstack) ) {
			// we failed to authenticate, we should bail out now
			// since we don't know what user is trying to perform
			// this action.
			// TODO: it'd be nice to print out what failed, but we
			// need better error propagation for that...
			errstack.push( "TransferD::setup_transfer_request_handler()", 42,
				"Failure to register transferd - Authentication failed" );
			dprintf( D_ALWAYS, "setup_transfer_request_handler() "
				"aborting: %s\n",
				errstack.getFullText() );
			refuse( rsock );
			return CLOSE_STREAM;
		} 
	}

	fquser = rsock->getFullyQualifiedUser();


	/////////////////////////////////////////////////////////////////////////
	// Check to see if the capability the client tells us is something that
	// we have knowledge of. We ONLY check the capability and not the
	// identity of the person in question. This allows people of different
	// identities to write files here as long as they had the right 
	// capability. While this might not sound secure, they STILL had to have
	// authenticated as someone this daemon trusts. 
	// Similarly, check the protocol it wants to use as well as ensure that
	// the direction the transfer request was supposed to be is being honored.
	/////////////////////////////////////////////////////////////////////////
	rsock->decode();

	// soak the request ad from the client about what it wants to transfer
	reqad.initFromStream(*rsock);
	rsock->end_of_message();

	reqad.LookupString(ATTR_TREQ_CAPABILITY, capability);

	rsock->encode();

	// do I know of such a capability?
	if (m_treqs.lookup(capability, treq) != 0) {
		// didn't find it. Log it and tell them to leave and close up shop
		respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
		respad.Assign(ATTR_TREQ_INVALID_REASON, "Invalid capability!");
		respad.put(*rsock);
		rsock->end_of_message();

		dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
			"using capability '%s', but there was no such capability. "
			"Access denied.\n", fquser.Value(), capability.Value());
		return CLOSE_STREAM;
	}

	reqad.LookupInteger(ATTR_TREQ_FTP, protocol);

	// am I willing to use this protocol?
	switch(protocol) {
		case FTP_CFTP: // FileTrans protocol, I'm happy.
			break;

		default:
			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"Invalid file transfer protocol!");
			respad.put(*rsock);
			rsock->end_of_message();

			dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
				"using protocol '%d', but I don't support that protocol. "
				"Access denied.\n", fquser.Value(), protocol);
			return CLOSE_STREAM;
	}

	// nsure that this transfer request was of the uploading variety
	if (treq->get_direction() != FTPD_UPLOAD) {
			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"Transfer Request was not an uploading request!");
			respad.put(*rsock);
			rsock->end_of_message();

			dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
				"to a transfer request that wasn't expecting to be written. "
				"Access denied.\n", fquser.Value());
	}

	/////////////////////////////////////////////////////////////////////////
	// Tell the client everything was ok.
	/////////////////////////////////////////////////////////////////////////

	respad.Assign(ATTR_TREQ_INVALID_REQUEST, FALSE);
	respad.put(*rsock);
	rsock->end_of_message();

	/////////////////////////////////////////////////////////////////////////
	// Set up a thread (a process under unix) to read ALL of the job files
	// for all of the ads in the TransferRequest.
	/////////////////////////////////////////////////////////////////////////

	// now create a thread, passing in the sock, which uses the file transfer
	// object to accept the files.

	if (transfer_reaper_id == -1) {
		// only set this up ONCE so each and every thread gets one.
		transfer_reaper_id = daemonCore->Register_Reaper(
						"write_files_reaper",
						(ReaperHandlercpp) &TransferD::write_files_reaper,
						"write_files_reaper",
						this
						);
	}

	thread_arg = new ThreadArg(protocol, treq);

	// Start a new thread (process on Unix) to do the work
	tid = daemonCore->Create_Thread(
		(ThreadStartFunc)&TransferD::write_files_thread,
		(void *)thread_arg,
		rsock,
		transfer_reaper_id
		);
	
	if (tid == FALSE) {
		// XXX How do I handle this failure?
	}


	// associate the tid with the request so I can deal with it propery in
	// the reaper
	m_client_to_transferd_threads.insert(tid, treq);

	// The stream is inherited to the thread, who does the transfer and
	// finishes the protocol, but in the parent, I'm closing it.
	return CLOSE_STREAM;
}
Esempio n. 22
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}

	// Increase the OOM score of this process; the child will inherit it.
	// This way, the job will be heavily preferred to be killed over a normal process.
	// OOM score is currently exponential - a score of 4 is a factor-16 increase in
	// the OOM score.
	setupOOMScore(4,800);
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
		/* Note on CONDOR_UNIVERSE_LOCAL - The cgroup setup code below
		 *  requires a unique name for the cgroup. It relies on
		 *  uniqueness of the MachineAd's Name
		 *  attribute. Unfortunately, in the local universe the
		 *  MachineAd (mach_ad elsewhere) is never populated, because
		 *  there is no machine. As a result the ASSERT on
		 *  starter_name fails. This means that the local universe
		 *  will not work on any machine that has BASE_CGROUP
		 *  configured. A potential workaround is to set
		 *  STARTER.BASE_CGROUP on any machine that is also running a
		 *  schedd, but that disables cgroup support from a
		 *  co-resident startd. Instead, I'm disabling cgroup support
		 *  from within the local universe until the intraction of
		 *  local universe and cgroups can be properly worked
		 *  out. -matt 7 nov '12
		 */
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		if (starter_name.size() == 0) {
			char buf[16];
			sprintf(buf, "%d", getpid());
			starter_name = buf;
		}
		//ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup_str += this->CgroupSuffix();
		
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	auto_free_ptr mount_under_scratch(param("MOUNT_UNDER_SCRATCH"));
	if (mount_under_scratch) {
		// try evaluating mount_under_scratch as a classad expression, if it is
		// an expression it must return a string. if it's not an expression, just
		// use it as a string (as we did before 8.3.6)
		classad::Value value;
		if (JobAd->EvaluateExpr(mount_under_scratch.ptr(), value)) {
			const char * pval = NULL;
			if (value.IsStringValue(pval)) {
				mount_under_scratch.set(strdup(pval));
			} else {
				// was an expression, but not a string, so report and error and fail.
				dprintf(D_ALWAYS | D_ERROR,
					"ERROR: MOUNT_UNDER_SCRATCH does not evaluate to a string, it is : %s\n",
					ClassAdValueToString(value));
				return FALSE;
			}
		}
	}

	// if execute dir is encrypted, add /tmp and /var/tmp to mount_under_scratch
	bool encrypt_execdir = false;
	JobAd->LookupBool(ATTR_ENCRYPT_EXECUTE_DIRECTORY,encrypt_execdir);
	if (encrypt_execdir || param_boolean_crufty("ENCRYPT_EXECUTE_DIRECTORY",false)) {
		// prepend /tmp, /var/tmp to whatever admin wanted. don't worry
		// if admin already listed /tmp etc - subdirs can appear twice
		// in this list because AddMapping() ok w/ duplicate entries
		MyString buf("/tmp,/var/tmp,");
		buf += mount_under_scratch.ptr();
		mount_under_scratch.set(buf.StrDup());
	}
	if (mount_under_scratch) {
		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							delete fs_remap;
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							delete fs_remap;
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						delete fs_remap;
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			delete fs_remap;
			return FALSE;
		}
		mount_under_scratch.clear();
	}

#if defined(LINUX)
	// On Linux kernel 2.6.24 and later, we can give each
	// job its own PID namespace
	if (param_boolean("USE_PID_NAMESPACES", false)) {
		if (!can_switch_ids()) {
			EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
				"call in Linux unless running as root.");
		}
		fi.want_pid_namespace = this->SupportsPIDNamespace();
		if (fi.want_pid_namespace) {
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			fs_remap->RemapProc();
		}

		// When PID Namespaces are enabled, need to run the job
		// under the condor_pid_ns_init program, so that signals
		// propagate through to the child.  

		// First tell the program where to log output status
		// via an environment variable
		if (param_boolean("USE_PID_NAMESPACE_INIT", true)) {
			Env env;
			MyString env_errors;
			MyString arg_errors;
			std::string filename;

			filename = Starter->GetWorkingDir();
			filename += "/.condor_pid_ns_status";
		
			env.MergeFrom(JobAd, &env_errors);
			env.SetEnv("_CONDOR_PID_NS_INIT_STATUS_FILENAME", filename);
			env.InsertEnvIntoClassAd(JobAd, &env_errors);

			Starter->jic->removeFromOutputFiles(condor_basename(filename.c_str()));
			this->m_pid_ns_status_filename = filename;
			
			// Now, set the job's CMD to the wrapper, and shift
			// over the arguments by one

			ArgList args;
			std::string cmd;

			JobAd->LookupString(ATTR_JOB_CMD, cmd);
			args.AppendArg(cmd);
			args.AppendArgsFromClassAd(JobAd, &arg_errors);
			args.InsertArgsIntoClassAd(JobAd, NULL, & arg_errors);
	
			std::string libexec;
			if( !param(libexec,"LIBEXEC") ) {
				dprintf(D_ALWAYS, "Cannot find LIBEXEC so can not run condor_pid_ns_init\n");
				return 0;
			}
			std::string c_p_n_i = libexec + "/condor_pid_ns_init";
			JobAd->Assign(ATTR_JOB_CMD, c_p_n_i);
		}
	}
	dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
#endif


	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	// See Note near setup of param(BASE_CGROUP)
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "CGROUP_MEMORY_LIMIT_POLICY", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				m_memory_limit = MemMb_big;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);

				// Note that ATTR_VIRTUAL_MEMORY on Linux
				// is sum of memory and swap, in Kilobytes

				int VMemKb;
				if (MachineAd->LookupInteger(ATTR_VIRTUAL_MEMORY, VMemKb)) {

					uint64_t memsw_limit = ((uint64_t)1024) * VMemKb;
					if (VMemKb > 0) {
						// we're not allowed to set memsw limit <
						// the hard memory limit.  If we haven't set the hard
						// memory limit, the default may be infinity.
						// So, if we've set soft, set hard limit to memsw - one page
						if (mem_is_soft) {
							uint64_t hard_limit = memsw_limit - 4096;
							climits.set_memory_limit_bytes(hard_limit, false);
						}
						climits.set_memsw_limit_bytes(memsw_limit);
					}
				} else {
					dprintf(D_ALWAYS, "Not setting virtual memory limit in cgroup because "
						"Virtual Memory attribute missing in machine ad.\n");
				}
			} else {
				dprintf(D_ALWAYS, "Not setting memory limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of CGROUP_MEMORY_LIMIT_POLICY: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int numCores = 1;
		if (MachineAd->LookupInteger(ATTR_CPUS, numCores)) {
			climits.set_cpu_shares(numCores*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of Cpus in machine ClassAd; ignoring.\n");
		}
		setupOOMEvent(cgroup);
	}

    m_statistics.Reconfig();

	// Now that the job is started, decrease the likelihood that the starter
	// is killed instead of the job itself.
	if (retval)
	{
		setupOOMScore(0,0);
	}

#endif

	return retval;
}
Esempio n. 23
0
void
doContactSchedd()
{
	int rc;
	Qmgr_connection *schedd;
	BaseJob *curr_job;
	ClassAd *next_ad;
	char expr_buf[12000];
	bool schedd_updates_complete = false;
	bool schedd_deletes_complete = false;
	bool add_remove_jobs_complete = false;
	bool update_jobs_complete = false;
	bool commit_transaction = true;
	int failure_line_num = 0;
	bool send_reschedule = false;
	std::string error_str = "";
	StringList dirty_job_ids;
	char *job_id_str;
	PROC_ID job_id;
	CondorError errstack;

	dprintf(D_FULLDEBUG,"in doContactSchedd()\n");

	initJobExprs();

	contactScheddTid = TIMER_UNSET;

	// vacateJobs
	/////////////////////////////////////////////////////
	if ( pendingScheddVacates.getNumElements() != 0 ) {
		std::string buff;
		StringList job_ids;
		VacateRequest curr_request;

		int result;
		ClassAd* rval;

		pendingScheddVacates.startIterations();
		while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
			formatstr( buff, "%d.%d", curr_request.job->procID.cluster,
						  curr_request.job->procID.proc );
			job_ids.append( buff.c_str() );
		}

		char *tmp = job_ids.print_to_string();
		if ( tmp ) {
			dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp );
			free(tmp);
			tmp = NULL;
		}

		rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack );
		if ( rval == NULL ) {
			formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!",
							   errstack.getFullText().c_str() );
			goto contact_schedd_failure;
		} else {
			pendingScheddVacates.startIterations();
			while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
				formatstr( buff, "job_%d_%d", curr_request.job->procID.cluster,
							  curr_request.job->procID.proc );
				if ( !rval->LookupInteger( buff.c_str(), result ) ) {
					dprintf( D_FULLDEBUG, "vacateJobs returned malformed ad\n" );
					EXCEPT( "vacateJobs returned malformed ad" );
				} else {
					dprintf( D_FULLDEBUG, "   %d.%d vacate result: %d\n",
							 curr_request.job->procID.cluster,
							 curr_request.job->procID.proc,result);
					pendingScheddVacates.remove( curr_request.job->procID );
					curr_request.result = (action_result_t)result;
					curr_request.job->SetEvaluateState();
					completedScheddVacates.insert( curr_request.job->procID,
												   curr_request );
				}
			}
			delete rval;
		}
	}


	schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName, CondorVersion() );
	if ( !schedd ) {
		error_str = "Failed to connect to schedd!";
		goto contact_schedd_failure;
	}


	// CheckLeases
	/////////////////////////////////////////////////////
	if ( checkLeasesSignaled ) {

		dprintf( D_FULLDEBUG, "querying for renewed leases\n" );

		// Grab the lease attributes of all the jobs in our global hashtable.

		BaseJob::JobsByProcId.startIterations();

		while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) {
			int new_expiration;

			rc = GetAttributeInt( curr_job->procID.cluster,
								  curr_job->procID.proc,
								  ATTR_TIMER_REMOVE_CHECK,
								  &new_expiration );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// This job doesn't have doesn't have a lease from
						// the submitter. Skip it.
					continue;
				}
			}
			curr_job->UpdateJobLeaseReceived( new_expiration );
		}

		checkLeasesSignaled = false;
	}	// end of handling check leases


	// AddJobs
	/////////////////////////////////////////////////////
	if ( addJobsSignaled || firstScheddContact ) {
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for new jobs\n" );

		// Make sure we grab all Globus Universe jobs (except held ones
		// that we previously indicated we were done with)
		// when we first start up in case we're recovering from a
		// shutdown/meltdown.
		// Otherwise, grab all jobs that are unheld and aren't marked as
		// currently being managed and aren't marked as not matched.
		// If JobManaged is undefined, equate it with false.
		// If Matched is undefined, equate it with true.
		// NOTE: Schedds from Condor 6.6 and earlier don't include
		//   "(Universe==9)" in the constraint they give to the gridmanager,
		//   so this gridmanager will pull down non-globus-universe ads,
		//   although it won't use them. This is inefficient but not
		//   incorrect behavior.
		if ( firstScheddContact ) {
			// Grab all jobs for us to manage. This expression is a
			// derivative of the expression below for new jobs. We add
			// "|| Managed =?= TRUE" to also get jobs our previous
			// incarnation was in the middle of managing when it died
			// (if it died unexpectedly). With the new term, the
			// "&& Managed =!= TRUE" from the new jobs expression becomes
			// superfluous (by boolean logic), so we drop it.
			sprintf( expr_buf,
					 "%s && %s && ((%s && %s) || %s)",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_managed.c_str()
					 );
		} else {
			// Grab new jobs for us to manage
			sprintf( expr_buf,
					 "%s && %s && %s && %s && %s",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_not_managed.c_str()
					 );
		}
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *old_job;
			int job_is_matched = 1;		// default to true if not in ClassAd

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			bool job_is_managed = jobExternallyManaged(next_ad);
			next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched);

			if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) {

				JobType *job_type = NULL;
				BaseJob *new_job = NULL;

				// job had better be either managed or matched! (or both)
				ASSERT( job_is_managed || job_is_matched );

				if ( MustExpandJobAd( next_ad ) ) {
					// Get the expanded ClassAd from the schedd, which
					// has the GridResource filled in with info from
					// the matched ad.
					delete next_ad;
					next_ad = NULL;
					next_ad = GetJobAd(procID.cluster,procID.proc);
					if ( next_ad == NULL && errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
					if ( next_ad == NULL ) {
						// We may get here if it was not possible to expand
						// one of the $$() expressions.  We don't want to
						// roll back the transaction and blow away the
						// hold that the schedd just put on the job, so
						// simply skip over this ad.
						dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d.  errno=%d\n",procID.cluster,procID.proc,errno);
						goto contact_schedd_next_add_job;
					}
				}

				// Search our job types for one that'll handle this job
				jobTypes.Rewind();
				while ( jobTypes.Next( job_type ) ) {
					if ( job_type->AdMatchFunc( next_ad ) ) {

						// Found one!
						dprintf( D_FULLDEBUG, "Using job type %s for job %d.%d\n",
								 job_type->Name, procID.cluster, procID.proc );
						break;
					}
				}

				if ( job_type != NULL ) {
					new_job = job_type->CreateFunc( next_ad );
				} else {
					dprintf( D_ALWAYS, "No handlers for job %d.%d\n",
							 procID.cluster, procID.proc );
					new_job = new BaseJob( next_ad );
				}

				ASSERT(new_job);
				new_job->SetEvaluateState();
				dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n",
						new_job->procID.cluster,new_job->procID.proc);
				num_ads++;

				if ( !job_is_managed ) {
					rc = tSetAttributeString( new_job->procID.cluster,
									   new_job->procID.proc,
									   ATTR_JOB_MANAGED,
									   MANAGED_EXTERNAL);
					if ( rc < 0 ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
				}

			} else {

				// We already know about this job, skip
				// But also set Managed=true on the schedd so that it won't
				// keep signalling us about it
				delete next_ad;
				rc = tSetAttributeString( procID.cluster, procID.proc,
								   ATTR_JOB_MANAGED, MANAGED_EXTERNAL );
				if ( rc < 0 ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				}

			}

contact_schedd_next_add_job:
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}	// end of while next_ad
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads);
	}	// end of handling add jobs


	// RemoveJobs
	/////////////////////////////////////////////////////

	// We always want to perform this check. Otherwise, we may overwrite a
	// REMOVED/HELD/COMPLETED status with something else below.
	{
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" );

		// Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we
		// haven't previously indicated that we're done with (by setting
		// JobManaged to "Schedd".
		sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))",
				 ScheddJobConstraint, expr_not_completely_done.c_str(),
				 ATTR_JOB_STATUS, REMOVED,
				 ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD,
				 ATTR_JOB_MANAGED, MANAGED_EXTERNAL );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *next_job;
			int curr_status;

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status );

			if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) {
				// Should probably skip jobs we already have marked as
				// held or removed

				next_job->JobAdUpdateFromSchedd( next_ad, true );
				num_ads++;

			} else if ( curr_status == REMOVED ) {

				// If we don't know about the job, act like we got an
				// ADD_JOBS signal from the schedd the next time we
				// connect, so that we'll create a Job object for it
				// and decide how it needs to be handled.
				// TODO The AddJobs and RemoveJobs queries shoule be
				//   combined into a single query.
				dprintf( D_ALWAYS, 
						 "Don't know about removed job %d.%d. "
						 "Will treat it as a new job to manage\n",
						 procID.cluster, procID.proc );
				addJobsSignaled = true;

			} else {

				dprintf( D_ALWAYS, "Don't know about held/completed job %d.%d. "
						 "Ignoring it\n",
						 procID.cluster, procID.proc );

			}

			delete next_ad;
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads);
	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	add_remove_jobs_complete = true;


	// Retrieve dirty attributes
	/////////////////////////////////////////////////////
	if ( updateJobsSignaled ) {
		dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" );

		sprintf( expr_buf, "%s && %s && %s && %s",
				 expr_schedd_job_constraint.c_str(), 
				 expr_not_completely_done.c_str(),
				 expr_not_held.c_str(),
				 expr_managed.c_str()
				 );
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			ClassAd updates;
			char str[PROC_ID_STR_BUFLEN];
			next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc );
			if ( GetDirtyAttributes( job_id.cluster, job_id.proc, &updates ) < 0 ) {
				dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc );
				failure_line_num = __LINE__;
				delete next_ad;
				goto contact_schedd_disconnect;
		        }
			else {
				dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc);
				dPrintAd(D_JOB, updates);
			}
			if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
				curr_job->JobAdUpdateFromSchedd( &updates, false );
				ProcIdToStr( job_id, str );
				dirty_job_ids.append( str );
			}
			else {
				dprintf( D_ALWAYS, "Don't know about updated job %d.%d. "
						 "Ignoring it\n",
						 job_id.cluster, job_id.proc );
			}
			delete next_ad;
			next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 );
		}
	}
	update_jobs_complete = true;

//	if ( BeginTransaction() < 0 ) {
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}


	// requestJobStatus
	/////////////////////////////////////////////////////
	if ( pendingJobStatus.getNumElements() != 0 ) {
		JobStatusRequest curr_request;

		pendingJobStatus.startIterations();
		while ( pendingJobStatus.iterate( curr_request ) != 0 ) {

			int status;

			rc = GetAttributeInt( curr_request.job_id.cluster,
								  curr_request.job_id.proc,
								  ATTR_JOB_STATUS, &status );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so return a job status of REMOVED.
					status = REMOVED;
				}
			}
				// return status
			dprintf( D_FULLDEBUG, "%d.%d job status: %d\n",
					 curr_request.job_id.cluster,
					 curr_request.job_id.proc, status );
			pendingJobStatus.remove( curr_request.job_id );
			curr_request.job_status = status;
			daemonCore->Reset_Timer( curr_request.tid, 0 );
			completedJobStatus.insert( curr_request.job_id,
									   curr_request );
		}

	}


	// Update existing jobs
	/////////////////////////////////////////////////////
	ScheddUpdateRequest *curr_request;
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n",
				curr_job->procID.cluster, curr_job->procID.proc);
		const char *attr_name;
		const char *attr_value;
		ExprTree *expr;
		bool fake_job_in_queue = false;
		curr_job->jobAd->ResetExpr();
		while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true &&
				fake_job_in_queue == false ) {
			attr_value = ExprTreeToString( expr );

			dprintf(D_FULLDEBUG,"   %s = %s\n",attr_name,attr_value);
			rc = SetAttribute( curr_job->procID.cluster,
							   curr_job->procID.proc,
							   attr_name,
							   attr_value);
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so pretend that all updates for the job succeed.
						// Otherwise, we'll never make forward progress on
						// the job.
						// TODO We should also fake a job status of REMOVED
						//   to the job, so it can do what cleanup it can.
					fake_job_in_queue = true;
					break;
				}
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_updates_complete = true;


	// Delete existing jobs
	/////////////////////////////////////////////////////
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		if ( curr_job->deleteFromSchedd ) {
			dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n",
					curr_job->procID.cluster, curr_job->procID.proc);
			rc = DestroyProc(curr_job->procID.cluster,
							 curr_job->procID.proc);
				// NOENT means the job doesn't exist.  Good enough for us.
			if ( rc < 0 && rc != DESTROYPROC_ENOENT) {
				failure_line_num = __LINE__;
				commit_transaction = false;
				goto contact_schedd_disconnect;
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_deletes_complete = true;


 contact_schedd_disconnect:
	DisconnectQ( schedd, commit_transaction );

	if ( add_remove_jobs_complete == true ) {
		firstScheddContact = false;
		addJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( update_jobs_complete == true ) {
		updateJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( schedd_updates_complete == false ) {
		formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	// Clear dirty bits for all jobs updated
	if ( !dirty_job_ids.isEmpty() ) {
		ClassAd *rval;
		dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n",
				 dirty_job_ids.number() );
		dirty_job_ids.rewind();
		rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack );
		if ( rval == NULL ) {
			dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes.  CondorError: %s\n", errstack.getFullText().c_str() );
		}
		delete rval;
	}

	// Wake up jobs that had schedd updates pending and delete job
	// objects that wanted to be deleted
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		curr_job->jobAd->ClearAllDirtyFlags();

		if ( curr_job->deleteFromGridmanager ) {

				// If the Job object wants to delete the job from the
				// schedd but we failed to do so, don't delete the job
				// object yet; wait until we successfully delete the job
				// from the schedd.
			if ( curr_job->deleteFromSchedd == true &&
				 schedd_deletes_complete == false ) {
				continue;
			}

				// If wantRematch is set, send a reschedule now
			if ( curr_job->wantRematch ) {
				send_reschedule = true;
			}
			pendingScheddUpdates.remove( curr_job->procID );
			pendingScheddVacates.remove( curr_job->procID );
			pendingJobStatus.remove( curr_job->procID );
			completedJobStatus.remove( curr_job->procID );
			completedScheddVacates.remove( curr_job->procID );
			delete curr_job;

		} else {
			pendingScheddUpdates.remove( curr_job->procID );

			if ( curr_request->m_notify ) {
				curr_job->SetEvaluateState();
			}
		}

		delete curr_request;
	}

	// Poke objects that wanted to be notified when a schedd update completed
	// successfully (possibly minus deletes)
	int timer_id;
	scheddUpdateNotifications.Rewind();
	while ( scheddUpdateNotifications.Next( timer_id ) ) {
		daemonCore->Reset_Timer( timer_id, 0 );
	}
	scheddUpdateNotifications.Clear();

	if ( send_reschedule == true ) {
		ScheddObj->reschedule();
	}

	// Check if we have any jobs left to manage. If not, exit.
	if ( BaseJob::JobsByProcId.getNumElements() == 0 ) {
		dprintf( D_ALWAYS, "No jobs left, shutting down\n" );
		daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM );
	}

	lastContactSchedd = time(NULL);

	if ( schedd_deletes_complete == false ) {
		error_str = "Problem using DestroyProc to delete jobs!";
		goto contact_schedd_failure;
	}

	scheddFailureCount = 0;

	// For each job that had dirty attributes, re-evaluate the policy
	dirty_job_ids.rewind();
	while ( (job_id_str = dirty_job_ids.next()) != NULL ) {
		StrToProcIdFixMe(job_id_str, job_id);
		if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
			curr_job->EvalPeriodicJobExpr();
		}
	}

dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n");
	return;

 contact_schedd_failure:
	scheddFailureCount++;
	if ( error_str == "" ) {
		error_str = "Failure in doContactSchedd";
	}
	if ( scheddFailureCount >= maxScheddFailures ) {
		dprintf( D_ALWAYS, "%s\n", error_str.c_str() );
		EXCEPT( "Too many failures connecting to schedd!" );
	}
	dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() );
	lastContactSchedd = time(NULL);
	RequestContactSchedd();
	return;
}
Esempio n. 24
0
void test_user_policy_on_exit_hold_no(void)
{
	int val;
	int action;
	char buf[4096];
	ClassAd *result;
	ClassAd *jad = new ClassAd;
	if (jad == NULL)
	{
		printf("Out of memory!\n");
		exit(EXIT_FAILURE);
	}
	
	printf("=======================================\n");
	printf("Testing User Policy on On Exit Hold: NO\n");

	/* set up the classad */
	sprintf(buf, "%s = %d", ATTR_ON_EXIT_CODE, 0);
	jad->Insert(buf);
	sprintf(buf, "%s = 40", ATTR_TOTAL_SUSPENSIONS);
	jad->Insert(buf);
	sprintf(buf, "%s = FALSE", ATTR_PERIODIC_HOLD_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = FALSE", ATTR_PERIODIC_REMOVE_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = TotalSuspensions == 42", ATTR_ON_EXIT_HOLD_CHECK);
	jad->Insert(buf);
	sprintf(buf, "%s = TRUE", ATTR_ON_EXIT_REMOVE_CHECK);
	jad->Insert(buf);

	result = user_job_policy(jad);

	result->EvalBool(ATTR_USER_POLICY_ERROR, result, val);
	if(val == true)
	{
		printf("An error happened\n");
		delete result;
		return;
	}

	result->EvalBool(ATTR_TAKE_ACTION, result, val);
	if (val == true)
	{
		printf("%s was true.\n", ATTR_TAKE_ACTION);
		result->LookupInteger(ATTR_USER_POLICY_ACTION, action);
		printf("Action is: %s\n", 
			action==REMOVE_JOB?"REMOVE_JOB":action==HOLD_JOB?"HOLD_JOB":
			"UNKNOWN");
		result->LookupString(ATTR_USER_POLICY_FIRING_EXPR, buf);
		printf("Reason for action: %s\n", buf);

		if (strcmp(ATTR_USER_POLICY_FIRING_EXPR, ATTR_ON_EXIT_HOLD_CHECK) == 0)
		{
			printf("Failed. I got removed because exit_hold was true.\n");
		}
		else
		{
			printf("Success. I was removed because of %s, not because of %s\n", 
				ATTR_ON_EXIT_REMOVE_CHECK, ATTR_ON_EXIT_HOLD_CHECK);
		}
	}
	else
	{
		printf("Ignoring correctly.\n");
	}
}
Esempio n. 25
0
bool DockerProc::JobReaper( int pid, int status ) {
	TemporaryPrivSentry sentry(PRIV_ROOT);
	dprintf( D_ALWAYS, "DockerProc::JobReaper()\n" );

	//
	// This should mean that the container has terminated.
	//
	if( pid == JobPid ) {
		//
		// Even running Docker in attached mode, we have a race condition
		// is exiting when the container exits, not when the docker daemon
		// notices that the container has exited.
		//

		int rv = -1;
		bool running = false;
		ClassAd dockerAd;
		CondorError error;
		// Years of careful research.
		for( int i = 0; i < 20; ++i ) {
			rv = DockerAPI::inspect( containerName, & dockerAd, error );
			if( rv < 0 ) {
				dprintf( D_FULLDEBUG, "Failed to inspect (for removal) container '%s'; sleeping a second (%d already slept) to give Docker a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			if( ! dockerAd.LookupBool( "Running", running ) ) {
				dprintf( D_FULLDEBUG, "Inspection of container '%s' failed to reveal its running state; sleeping a second (%d already slept) to give Docke a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			if( running ) {
				dprintf( D_FULLDEBUG, "Inspection reveals that container '%s' is still running; sleeping a second (%d already slept) to give Docker a chance to catch up.\n", containerName.c_str(), i );
				sleep( 1 );
				continue;
			}

			break;
		}

// FIXME: Move all this shared conditional-checking into a function.

		if( rv < 0 ) {
			dprintf( D_ALWAYS | D_FAILURE, "Failed to inspect (for removal) container '%s'.\n", containerName.c_str() );
			std::string imageName;
			if( ! JobAd->LookupString( ATTR_DOCKER_IMAGE, imageName ) ) {
				dprintf( D_ALWAYS | D_FAILURE, "%s not defined in job ad.\n", ATTR_DOCKER_IMAGE );
				imageName = "Unknown"; // shouldn't ever happen
			}

			std::string message;
			formatstr(message, "Cannot start container: invalid image name: %s", imageName.c_str());

			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_InvalidDockerImage, 0);
			return VanillaProc::JobReaper( pid, status );
		}

		if( ! dockerAd.LookupBool( "Running", running ) ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal its running state.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}

		if( running ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection reveals that container '%s' is still running.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}

		// FIXME: Rethink returning a classad.  Having to check for missing
		// attributes blows.

		// TODO: Set status appropriately (as if it were from waitpid()).
		std::string oomkilled;
		if (! dockerAd.LookupString( "OOMKilled", oomkilled)) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal whether it was OOM killed. Assuming it was not.\n", containerName.c_str() );
		}
		
		if (oomkilled.find("true") == 0) {
			ClassAd *machineAd = Starter->jic->machClassAd();
			int memory;
			machineAd->LookupInteger(ATTR_MEMORY, memory);
			std::string message;
			formatstr(message, "Docker job exhaused %d Mb memory", memory);
			dprintf(D_ALWAYS, "%s, going on hold\n", message.c_str());

			
			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_JobOutOfResources, 0);
			DockerAPI::rm( containerName, error );

			if ( Starter->Hold( ) ) {
				Starter->allJobsDone();
				this->JobExit();
			}

			Starter->ShutdownFast();
			return 0;
		}

			// See if docker could not run the job
			// most likely invalid executable
		std::string dockerError;
		if (! dockerAd.LookupString( "DockerError", dockerError)) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal whether there was an internal docker error.\n", containerName.c_str() );
		}

		if (dockerError.length() > 0) {
			std::string message;
			formatstr(message, "Error running docker job: %s", dockerError.c_str());
			dprintf(D_ALWAYS, "%s, going on hold\n", message.c_str());

			
			Starter->jic->holdJob(message.c_str(), CONDOR_HOLD_CODE_FailedToCreateProcess, 0);
			DockerAPI::rm( containerName, error );

			if ( Starter->Hold( ) ) {
				Starter->allJobsDone();
				this->JobExit();
			}

			Starter->ShutdownFast();
			return 0;
		} 
		
		int dockerStatus;
		if( ! dockerAd.LookupInteger( "ExitCode", dockerStatus ) ) {
			dprintf( D_ALWAYS | D_FAILURE, "Inspection of container '%s' failed to reveal its exit code.\n", containerName.c_str() );
			return VanillaProc::JobReaper( pid, status );
		}
		dprintf( D_FULLDEBUG, "Setting status of Docker job to %d.\n", dockerStatus );
		status = dockerStatus;

		// TODO: Record final job usage.

		// We don't have to do any process clean-up, because container.
		// We'll do the disk clean-up after we've transferred files.
	}

	// This helps to make ssh-to-job more plausible.
	return VanillaProc::JobReaper( pid, status );
}
Esempio n. 26
0
/* evaluate various periodic checks during the running of the shadow and
	perform actions based upon what special attributes evaluate to. */
bool periodic_policy(void)
{
	ClassAd *result;
	int val;
	int action;
	char buf[4096];
	char buf2[4096];

	/* See what the user job policy has in store for me. */
	result = user_job_policy(JobAd);
	
	result->EvalBool(ATTR_USER_POLICY_ERROR, result, val);
	if (val == 1)
	{
		dprintf(D_ALWAYS, "There was an error in the periodic policy\n");
		delete result;
		return false;
	}

	result->EvalBool(ATTR_TAKE_ACTION, result, val);
	if (val == 1)
	{
		result->LookupString(ATTR_USER_POLICY_FIRING_EXPR, buf, sizeof(buf));

		result->LookupInteger(ATTR_USER_POLICY_ACTION, action);
		switch(action)
		{
			case REMOVE_JOB:
				dprintf(D_ALWAYS, "Periodic Policy: removing job because %s "
					"has become true\n", buf);
				/* set some yucky global variables */
				JobStatus = 0;
				JobExitStatus = 0;
				delete result;
				return true;
				break;

			case HOLD_JOB:
				sprintf(buf, "Periodic Policy: holding job because %s has "
					"become true\n", buf);

				sprintf(buf, "Your job has been held because %s has become "
					"true\n", ATTR_PERIODIC_HOLD_CHECK);
				sprintf(buf2, "Your job has been held because %s has become "
					"true", ATTR_PERIODIC_HOLD_CHECK);

				delete result;

				/* This exits */
				HoldJob(buf, buf2, CONDOR_HOLD_CODE_JobPolicy, 0);
				break;

			default:
				dprintf(D_ALWAYS, "WARNING! Ignoring unknown action type in "
						"periodic_policy()\n");
				delete result;
				return false;
				break;
		}
	}

	delete result;
	return false;
}
Esempio n. 27
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
	if (cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	char * mount_under_scratch = param("MOUNT_UNDER_SCRATCH");
	if (mount_under_scratch) {

		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);
			free(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			return FALSE;
		}
	}

	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	if (cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "MEMORY_LIMIT", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);
			} else {
				dprintf(D_ALWAYS, "Not setting memory soft limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory soft limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of MEMORY_LIMIT: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int slotWeight;
		if (MachineAd->LookupInteger(ATTR_SLOT_WEIGHT, slotWeight)) {
			climits.set_cpu_shares(slotWeight*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of SlotWeight in machine ClassAd; ignoring.\n");
		}
	}

#endif

	return retval;
}
Esempio n. 28
0
//
// Because we fork before calling docker, we don't actually
// care if the image is stored locally or not (except to the extent that
// remote image pull violates the principle of least astonishment).
//
int DockerAPI::run(
	ClassAd &machineAd,
	ClassAd &jobAd,
	const std::string & containerName,
	const std::string & imageID,
	const std::string & command,
	const ArgList & args,
	const Env & env,
	const std::string & sandboxPath,
	const std::list<std::string> extraVolumes,
	int & pid,
	int * childFDs,
	CondorError & /* err */ )
{
	gc_image(imageID);
	//
	// We currently assume that the system has been configured so that
	// anyone (user) who can run an HTCondor job can also run docker.  It's
	// also apparently a security worry to run Docker as root, so let's not.
	//
	ArgList runArgs;
	if ( ! add_docker_arg(runArgs))
		return -1;
	runArgs.AppendArg( "run" );

	// Write out a file with the container ID.
	// FIXME: The startd can check this to clean up after us.
	// This needs to go into a directory that condor user
	// can write to.

/*
	std::string cidFileName = sandboxPath + "/.cidfile";
	runArgs.AppendArg( "--cidfile=" + cidFileName );
*/

	
	// Configure resource limits.
	
	// First cpus
	int  cpus;
	int cpuShare;

	if (machineAd.LookupInteger(ATTR_CPUS, cpus)) {
		cpuShare = 10 * cpus;
	} else {
		cpuShare = 10;
	}
	std::string cpuShareStr;
	formatstr(cpuShareStr, "--cpu-shares=%d", cpuShare);
	runArgs.AppendArg(cpuShareStr);

	// Now memory
	int memory; // in Megabytes
	if (machineAd.LookupInteger(ATTR_MEMORY, memory)) {
		std::string mem;
		formatstr(mem, "--memory=%dm", memory);
		runArgs.AppendArg(mem);
	} 

	// drop unneeded Linux capabilities
	if (param_boolean("DOCKER_DROP_ALL_CAPABILITIES", true /*default*/,
		true /*do_log*/, &machineAd, &jobAd)) {
		runArgs.AppendArg("--cap-drop=all");
			
		// --no-new-privileges flag appears in docker 1.11
		if (DockerAPI::majorVersion > 1 ||
		    DockerAPI::minorVersion > 10) {
			runArgs.AppendArg("--no-new-privileges");
		}
	}

	// Give the container a useful name
	std::string hname = makeHostname(&machineAd, &jobAd);
	runArgs.AppendArg("--hostname");
	runArgs.AppendArg(hname.c_str());

		// Now the container name
	runArgs.AppendArg( "--name" );
	runArgs.AppendArg( containerName );

	if ( ! add_env_to_args_for_docker(runArgs, env)) {
		dprintf( D_ALWAYS | D_FAILURE, "Failed to pass enviroment to docker.\n" );
		return -8;
	}

	// Map the external sanbox to the internal sandbox.
	runArgs.AppendArg( "--volume" );
	runArgs.AppendArg( sandboxPath + ":" + sandboxPath );

	// Now any extra volumes
	for (std::list<std::string>::const_iterator it = extraVolumes.begin(); it != extraVolumes.end(); it++) {
		runArgs.AppendArg("--volume");
		std::string volume = *it;
		runArgs.AppendArg(volume);
	}
	
	// Start in the sandbox.
	runArgs.AppendArg( "--workdir" );
	runArgs.AppendArg( sandboxPath );

	// Run with the uid that condor selects for the user
	// either a slot user or submitting user or nobody
	uid_t uid = 0;
	uid_t gid = 0;

	// Docker doesn't actually run on Windows, but we compile
	// on Windows because...
#ifndef WIN32
	uid = get_user_uid();
	gid = get_user_gid();
#endif
	
	if ((uid == 0) || (gid == 0)) {
		dprintf(D_ALWAYS|D_FAILURE, "Failed to get userid to run docker job\n");
		return -9;
	}

	runArgs.AppendArg("--user");
	std::string uidgidarg;
	formatstr(uidgidarg, "%d:%d", uid, gid);
	runArgs.AppendArg(uidgidarg);

	// Run the command with its arguments in the image.
	runArgs.AppendArg( imageID );

	
	// If no command given, the default command in the image will run
	if (command.length() > 0) {
		runArgs.AppendArg( command );
	}

	runArgs.AppendArgsFromArgList( args );

	MyString displayString;
	runArgs.GetArgsStringForLogging( & displayString );
	dprintf( D_ALWAYS, "Attempting to run: %s\n", displayString.c_str() );

	//
	// If we run Docker attached, we avoid a race condition where
	// 'docker logs --follow' returns before 'docker rm' knows that the
	// container is gone (and refuses to remove it).  Of course, we
	// can't block, so we have a proxy process run attached for us.
	//
	FamilyInfo fi;
	fi.max_snapshot_interval = param_integer( "PID_SNAPSHOT_INTERVAL", 15 );
	int childPID = daemonCore->Create_Process( runArgs.GetArg(0), runArgs,
		PRIV_CONDOR_FINAL, 1, FALSE, FALSE, NULL, "/",
		& fi, NULL, childFDs );

	if( childPID == FALSE ) {
		dprintf( D_ALWAYS | D_FAILURE, "Create_Process() failed.\n" );
		return -1;
	}
	pid = childPID;

	return 0;
}
Esempio n. 29
0
int
main(int argc, char *argv[])
{
	char	*arg;
	int		nArgs = 0;				// number of args 
	int	 i, result;
	char* pool = NULL;
	char* scheddName = NULL;
	char* scheddAddr = NULL;
	MyString method;
	char *tmp;

	myDistro->Init( argc, argv );
	MyName = condor_basename(argv[0]);
	config();

#if !defined(WIN32)
	install_sig_handler(SIGPIPE, SIG_IGN );
#endif

	// dig around in the config file looking for what the config file says
	// about getting files from Condor. This defaults with the global variable
	// initialization.
	tmp = param( "SANDBOX_TRANSFER_METHOD" );
	if ( tmp != NULL ) {
		method = tmp;
		free( tmp );
		string_to_stm( method, st_method );
	}

	char **args = (char **)malloc(sizeof(char *) * argc); // args 
	if ( ! args) exit(2);

	// parse the arguments.
	for( argv++; (arg = *argv); argv++ ) {
		if( arg[0] == '-' ) {
			if( ! arg[1] ) {
				usage();
			}
			switch( arg[1] ) {
			case 'd':
				// dprintf to console
				dprintf_set_tool_debug("TOOL", 0);
				break;
			case 'c':
				args[nArgs] = arg;
				nArgs++;
				argv++;
				if( ! *argv ) {
					fprintf( stderr, 
							 "%s: -constraint requires another argument\n", 
							 MyName);
					exit(1);
				}				
				args[nArgs] = *argv;
				nArgs++;
				break;
			case 'a':
				if( arg[2] && arg[2] == 'd' ) {
					argv++;
					if( ! *argv ) {
						fprintf( stderr, 
								 "%s: -addr requires another argument\n", 
								 MyName);
						exit(1);
					}				
					if( is_valid_sinful(*argv) ) {
						scheddAddr = strdup(*argv);
						if( ! scheddAddr ) {
							fprintf( stderr, "Out of Memory!\n" );
							exit(1);
						}
					} else {
						fprintf( stderr, 
								 "%s: \"%s\" is not a valid address\n",
								 MyName, *argv );
						fprintf( stderr, "Should be of the form "
								 "<ip.address.here:port>\n" );
						fprintf( stderr, 
								 "For example: <123.456.789.123:6789>\n" );
						exit( 1 );
					}
					break;
				}
				All = true;
				break;
			case 'n': 
				// use the given name as the schedd name to connect to
				argv++;
				if( ! *argv ) {
					fprintf( stderr, "%s: -name requires another argument\n", 
							 MyName);
					exit(1);
				}			
				if ( scheddName ) free(scheddName);
				scheddName = strdup(*argv);
				break;
			case 'p':
				// use the given name as the central manager to query
				argv++;
				if( ! *argv ) {
					fprintf( stderr, "%s: -pool requires another argument\n", 
							 MyName);
					exit(1);
				}				
				if( pool ) {
					free( pool );
				}
				pool = strdup( *argv );
				break;
			case 's':
				argv++;
				if( ! *argv ) {
					fprintf( stderr, "%s: -stm requires another argument\n", 
							 MyName);
					exit(1);
				}				
				method = *argv;
				string_to_stm(method, st_method);
				break;
			case 'v':
				version();
				break;
			case 'h':
				usage(0);
				break;
			default:
				fprintf( stderr, "Unrecognized option: %s\n", arg ); 
				usage();
				break;
			}
		} else {
			if( All ) {
					// If -all is set, there should be no other
					// constraint arguments.
				usage();
			}
			args[nArgs] = arg;
			nArgs++;
		}
	}

	// Check to make sure we have a valid sandbox transfer mechanism.
	if (st_method == STM_UNKNOWN) {
		fprintf( stderr,
			"%s: Unknown sandbox transfer method: %s\n", MyName,
			method.Value());
		usage();
		exit(1);
	}

	if( ! (All || nArgs) ) {
			// We got no indication of what to act on


		fprintf( stderr, "You did not specify any jobs\n" ); 
		usage();
	}

		// We're done parsing args, now make sure we know how to
		// contact the schedd. 
	if( ! scheddAddr ) {
			// This will always do the right thing, even if either or
			// both of scheddName or pool are NULL.
		schedd = new DCSchedd( scheddName, pool );
	} else {
		schedd = new DCSchedd( scheddAddr );
	}
	if( ! schedd->locate() ) {
		fprintf( stderr, "%s: %s\n", MyName, schedd->error() ); 
		exit( 1 );
	}

		// Process the args.
	if( All ) {
		handleAll();
	} else {
		for(i = 0; i < nArgs; i++) {
			if( match_prefix( args[i], "-constraint" ) ) {
				i++;
				addConstraint( args[i] );
			} else {
				procArg(args[i]);
			}
		}
	}

		// Sanity check: make certain we now have a constraint
	if ( global_constraint.Length() <= 0 ) {			
		fprintf( stderr, "Unable to create a job constraint!\n");
		exit(1);
	}

	fprintf(stdout,"Fetching data files...\n");

	switch(st_method) {
		case STM_USE_SCHEDD_ONLY:
			{ // start block

			// Get the sandbox directly from the schedd.
			// And now, do the work.
			CondorError errstack;
			result = schedd->receiveJobSandbox(global_constraint.Value(),
				&errstack);
			if ( !result ) {
				fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() );
				fprintf( stderr, "ERROR: Failed to spool job files.\n" );
				exit(1);
			}
		
			// All done
			return 0;

			} //end block
			break;

		case STM_USE_TRANSFERD:
			{ // start block

			// NEW METHOD where we ask the schedd for a transferd, then get the
			// files from the transferd

			CondorError errstack;
			ClassAd respad;
			int invalid;
			MyString reason;
			MyString td_sinful;
			MyString td_cap;

			result = schedd->requestSandboxLocation(FTPD_DOWNLOAD, 
				global_constraint, FTP_CFTP, &respad, &errstack);
			if ( !result ) {
				fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() );
				fprintf( stderr, "ERROR: Failed to spool job files.\n" );
				exit(1);
			}

			respad.LookupInteger(ATTR_TREQ_INVALID_REQUEST, invalid);
			if (invalid == TRUE) {
				fprintf( stderr, "ERROR: Failed to spool job files.\n" );
				respad.LookupString(ATTR_TREQ_INVALID_REASON, reason);
				fprintf( stderr, "%s\n", reason.Value());
				exit(EXIT_FAILURE);
			}

			respad.LookupString(ATTR_TREQ_TD_SINFUL, td_sinful);
			respad.LookupString(ATTR_TREQ_CAPABILITY, td_cap);

			dprintf(D_ALWAYS, 
				"td: %s, cap: %s\n", td_sinful.Value(), td_cap.Value());

			DCTransferD dctd(td_sinful.Value());

			result = dctd.download_job_files(&respad, &errstack);
			if ( !result ) {
				fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() );
				fprintf( stderr, "ERROR: Failed to spool job files.\n" );
				exit(1);
			}

			} // end block
		break;

		default:
			EXCEPT("PROGRAMMER ERROR: st_method must be known.");
			break;
		}

	// All done
	return 0;
}
Esempio n. 30
0
bool
SchedulerObject::submit(AttributeMapType &jobAdMap, std::string &id, std::string &text)
{
	int cluster;
	int proc;

    if (!m_codec) {
        text = "Codec has not been initialized";
        return false;
    }

	// our mandatory set of attributes for a submit
	const char* required[] = {
				ATTR_JOB_CMD,
				ATTR_REQUIREMENTS,
				ATTR_OWNER,
				ATTR_JOB_IWD,
				NULL
				};

		// 1. Create transaction
	BeginTransaction();

		// 2. Create cluster
	if (-1 == (cluster = NewCluster())) {
		AbortTransaction();
		text = "Failed to create new cluster";
		return false;
	}

		// 3. Create proc
	if (-1 == (proc = NewProc(cluster))) {
		AbortTransaction();
		text = "Failed to create new proc";
		return false;
	}

		// 4. Submit job ad

		// Schema: (vanilla job)
		// Schedd demands - Owner, JobUniverse
		// To run - JobStatus, Requirements

		// Schedd excepts if no Owner
		// Schedd prunes on startup if no Owner or JobUniverse
		// Schedd won't run job without JobStatus
		// Job cannot match without Requirements
		// Shadow rejects jobs without an Iwd
		// Shadow: Job has no CondorVersion, assuming pre version 6.3.3
		// Shadow: Unix Vanilla job is pre version 6.3.3, setting 'TransferFiles = "NEVER"'
		// Starter won't run job without Cmd
		// Starter needs a valid Owner (local account name) if not using nobody
		// condor_q requires ClusterId (int), ProcId (int), QDate (int), RemoteUserCpu (float), JobStatus (int), JobPrio (int), ImageSize (int), Owner (str) and Cmd (str)

		// Schema: (vm job)
		// ShouldTransferFiles - unset by default, must be set

	ClassAd ad;
	int universe;

    // ShouldTransferFiles - unset by default, must be set
    // shadow will try to setup local transfer sandbox otherwise
    // without good priv
    ad.Assign(ATTR_SHOULD_TRANSFER_FILES, "NO");

	if (!m_codec->mapToClassAd(jobAdMap, ad, text)) {
		AbortTransaction();
		return false;
	}

	std::string missing;
	if (!checkRequiredAttrs(ad, required, missing)) {
		AbortTransaction();
		text = "Job ad is missing required attributes: " + missing;
		return false;
	}

		// EARLY SET: These attribute are set early so the incoming ad
		// has a change to override them.
	::SetAttribute(cluster, proc, ATTR_JOB_STATUS, "1"); // 1 = idle

		// Junk that condor_q wants, but really shouldn't be necessary
	::SetAttribute(cluster, proc, ATTR_JOB_REMOTE_USER_CPU, "0.0"); // float
	::SetAttribute(cluster, proc, ATTR_JOB_PRIO, "0");              // int
	::SetAttribute(cluster, proc, ATTR_IMAGE_SIZE, "0");            // int

	if (!ad.LookupInteger(ATTR_JOB_UNIVERSE, universe)) {
		char* uni_str = param("DEFAULT_UNIVERSE");
		if (!uni_str) {
			universe = CONDOR_UNIVERSE_VANILLA;
		}
		else {
			universe = CondorUniverseNumber(uni_str);
		}
		::SetAttributeInt(cluster, proc, ATTR_JOB_UNIVERSE, universe );
	}
	// more stuff - without these our idle stats are whack
	if ( universe != CONDOR_UNIVERSE_MPI && universe != CONDOR_UNIVERSE_PVM ) {
		::SetAttribute(cluster, proc, ATTR_MAX_HOSTS, "1");              // int
		::SetAttribute(cluster, proc, ATTR_MIN_HOSTS, "1");            // int
	}
	::SetAttribute(cluster, proc, ATTR_CURRENT_HOSTS, "0"); // int

	ExprTree *expr;
	const char *name;
	std::string value;
	ad.ResetExpr();
	while (ad.NextExpr(name,expr)) {

			// All these extra lookups are horrible. They have to
			// be there because the ClassAd may have multiple
			// copies of the same attribute name! This means that
			// the last attribute with a given name will set the
			// value, but the last attribute tends to be the
			// attribute with the oldest (wrong) value. How
			// annoying is that!
		if (!(expr = ad.Lookup(name))) {
			dprintf(D_ALWAYS, "Failed to lookup %s\n", name);

			AbortTransaction();
			text = "Failed to parse job ad attribute";
			return false;
		}

        value = ExprTreeToString(expr);
        ::SetAttribute(cluster, proc, name, value.c_str());
	}

		// LATE SET: These attributes are set late, after the incoming
		// ad, so they override whatever the incoming ad set.
	char buf[22]; // 22 is max size for an id, 2^32 + . + 2^32 + \0
	snprintf(buf, 22, "%d", cluster);
	::SetAttribute(cluster, proc, ATTR_CLUSTER_ID, buf);
	snprintf(buf, 22, "%d", proc);
	::SetAttribute(cluster, proc, ATTR_PROC_ID, buf);
	snprintf(buf, 22, "%ld", time(NULL));
	::SetAttribute(cluster, proc, ATTR_Q_DATE, buf);

		// Could check for some invalid attributes, e.g
		//  JobUniverse <= CONDOR_UNIVERSE_MIN or >= CONDOR_UNIVERSE_MAX

		// 5. Commit transaction
	CommitTransaction();


		// 6. Reschedule
	scheduler.needReschedule();


		// 7. Return identifier
		// TODO: dag ids?
	string tmp;
	//tmp.sprintf("%s#%d.%d", Name, cluster, proc);
	// we have other API compositions for job id and submission id
	// so let's return raw cluster.proc
	aviUtilFmt(tmp,"%d.%d", cluster, proc);
	id = tmp.c_str();

	return true;
}