Exemplo n.º 1
0
void
VMGahp::executeStart(VMRequest *req)
{
	// Expecting: VMGAHP_COMMAND_VM_START <req_id> <type>
	char* vmtype = req->m_args.argv[2];

	if( m_jobAd == NULL ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_JOBCLASSAD_INFO;
		return;
	}

	MyString vmworkingdir;
	if( m_jobAd->LookupString( "VM_WORKING_DIR", vmworkingdir) != 1 ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_JOBCLASSAD_INFO;
		vmprintf(D_ALWAYS, "VM_WORKING_DIR cannot be found in vm classAd\n");
		return;
	}

	MyString job_vmtype;
	if( m_jobAd->LookupString( ATTR_JOB_VM_TYPE, job_vmtype) != 1 ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_JOBCLASSAD_INFO;
		vmprintf(D_ALWAYS, "VM_TYPE('%s') cannot be found in vm classAd\n",
							ATTR_JOB_VM_TYPE);
		return;
	}

	if(strcasecmp(vmtype, job_vmtype.Value()) != 0 ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_SUPPORTED_VM_TYPE;
		vmprintf(D_ALWAYS, "Argument is %s but VM_TYPE in job classAD "
						"is %s\n", vmtype, job_vmtype.Value());
		return;
	}

	if(strcasecmp(vmtype, m_gahp_config->m_vm_type.Value()) != 0 ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_SUPPORTED_VM_TYPE;
		return;
	}

	VMType *new_vm = NULL;

	// TBD: tstclair this totally needs to be re-written
#if defined (HAVE_EXT_LIBVIRT) && !defined(VMWARE_ONLY)
	if(strcasecmp(vmtype, CONDOR_VM_UNIVERSE_XEN) == 0 ) {
		new_vm = new XenType( vmworkingdir.Value(), m_jobAd );
		ASSERT(new_vm);
	}else if(strcasecmp(vmtype, CONDOR_VM_UNIVERSE_KVM) == 0) {
	  new_vm = new KVMType(
				vmworkingdir.Value(), m_jobAd);
		ASSERT(new_vm);
	}else
#endif
	if(strcasecmp(vmtype, CONDOR_VM_UNIVERSE_VMWARE) == 0 ) {
		new_vm = new VMwareType(m_gahp_config->m_prog_for_script.Value(),
				m_gahp_config->m_vm_script.Value(),
				vmworkingdir.Value(), m_jobAd);
		ASSERT(new_vm);
	}else
	  {
		// We should not reach here
		vmprintf(D_ALWAYS, "vmtype(%s) is not yet implemented\n", vmtype);
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = VMGAHP_ERR_NO_SUPPORTED_VM_TYPE;
		return;
	}

	new_vm->Config();
	if( new_vm->CreateConfigFile() == false ) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = makeErrorMessage(new_vm->m_result_msg.Value());
		delete new_vm;
		new_vm = NULL;
		vmprintf(D_FULLDEBUG, "CreateConfigFile fails in executeStart!\n");
		return;
	}

	int result = new_vm->Start();

	if(result == false) {
		req->m_has_result = true;
		req->m_is_success = false;
		req->m_result = makeErrorMessage(new_vm->m_result_msg.Value());
		delete new_vm;
		new_vm = NULL;
		vmprintf(D_FULLDEBUG, "executeStart fail!\n");
		return;
	} else {
		// Success to create a new VM
		req->m_has_result = true;
		req->m_is_success = true;

		// Result is set to a new vm_id
		req->m_result = "";
		req->m_result += new_vm->getVMId();

		addVM(new_vm);
		vmprintf(D_FULLDEBUG, "executeStart success!\n");
		return;
	}
}
Exemplo n.º 2
0
bool init_local_hostname_impl()
{
	bool local_hostname_initialized = false;
	if (param(local_hostname, "NETWORK_HOSTNAME")) {
		local_hostname_initialized = true;
		dprintf(D_HOSTNAME, "NETWORK_HOSTNAME says we are %s\n", local_hostname.Value());
	}

	if( ! local_hostname_initialized ) {
		// [TODO:IPV6] condor_gethostname is not IPv6 safe. Reimplement it.
		char hostname[MAXHOSTNAMELEN];
		int ret = condor_gethostname(hostname, sizeof(hostname));
		if (ret) {
			dprintf(D_ALWAYS, "condor_gethostname() failed. Cannot initialize "
					"local hostname, ip address, FQDN.\n");
			return false;
		}
		local_hostname = hostname;
	}

	MyString test_hostname = local_hostname;

	bool local_ipaddr_initialized = false;
	bool local_ipv4addr_initialized = false;
	bool local_ipv6addr_initialized = false;

	MyString network_interface;
	if (param(network_interface, "NETWORK_INTERFACE")) {
		if(local_ipaddr_initialized == false &&
			local_ipaddr.from_ip_string(network_interface)) {
			local_ipaddr_initialized = true;
			if(local_ipaddr.is_ipv4()) { 
				local_ipv4addr = local_ipaddr;
				local_ipv4addr_initialized = true;
			}
			if(local_ipaddr.is_ipv6()) { 
				local_ipv6addr = local_ipaddr;
				local_ipv6addr_initialized = true;
			}
		}
	}

	if( ! local_ipaddr_initialized ) {
		std::string ipv4, ipv6, ipbest;
		if( network_interface_to_ip("NETWORK_INTERFACE", network_interface.Value(), ipv4, ipv6, ipbest, NULL)) {
			ASSERT(local_ipaddr.from_ip_string(ipbest));
			// If this fails, network_interface_to_ip returns something invalid.
			local_ipaddr_initialized = true;
		} else {
			dprintf(D_ALWAYS, "Unable to identify IP address from interfaces.  None match NETWORK_INTERFACE=%s. Problems are likely.\n", network_interface.Value());
		}
		if((!ipv4.empty()) && local_ipv4addr.from_ip_string(ipv4)) {
			local_ipv4addr_initialized = true;
			ASSERT(local_ipv4addr.is_ipv4());
		}
		if((!ipv6.empty()) && local_ipv6addr.from_ip_string(ipv6)) {
			local_ipv6addr_initialized = true;
			ASSERT(local_ipv6addr.is_ipv6());
		}
	}

	bool local_fqdn_initialized = false;
	if (nodns_enabled()) {
			// condor_gethostname() returns a hostname with
			// DEFAULT_DOMAIN_NAME. Thus, it is always fqdn
		local_fqdn = local_hostname;
		local_fqdn_initialized = true;
		if (!local_ipaddr_initialized) {
			local_ipaddr = convert_hostname_to_ipaddr(local_hostname);
			local_ipaddr_initialized = true;
		}
	}

	addrinfo_iterator ai;

	if( ! nodns_enabled() ) {
		const int MAX_TRIES = 20;
		const int SLEEP_DUR = 3;
		bool gai_success = false;
		for(int try_count = 1; true; try_count++) {
			addrinfo hint = get_default_hint();
			hint.ai_family = AF_UNSPEC;
			int ret = ipv6_getaddrinfo(test_hostname.Value(), NULL, ai, hint);
			if(ret == 0) { gai_success = true; break; }
			if(ret != EAI_AGAIN ) {
				dprintf(D_ALWAYS, "init_local_hostname_impl: ipv6_getaddrinfo() could not look up '%s': %s (%d).  Error is not recoverable; giving up.  Problems are likely.\n", test_hostname.Value(), gai_strerror(ret), ret );
				gai_success = false;
				break;
			}

			dprintf(D_ALWAYS, "init_local_hostname_impl: ipv6_getaddrinfo() returned EAI_AGAIN for '%s'.  Will try again after sleeping %d seconds (try %d of %d).\n", test_hostname.Value(), SLEEP_DUR, try_count + 1, MAX_TRIES );
			if(try_count == MAX_TRIES) {
				dprintf(D_ALWAYS, "init_local_hostname_impl: ipv6_getaddrinfo() never succeeded. Giving up. Problems are likely\n");
				break;
			}
			sleep(SLEEP_DUR);
		}

		if(gai_success) {
			int local_hostname_desireability = 0;
#ifdef TEST_DNS_TODO
			int local_ipaddr_desireability = 0;
			int local_ipv4addr_desireability = 0;
			int local_ipv6addr_desireability = 0;
#endif
			while (addrinfo* info = ai.next()) {
				// TODO: the only time ai_canonname should be set is the first
				// record. Why are we testing its desirability?
				const char* name = info->ai_canonname;
				if (!name)
					continue;
				condor_sockaddr addr(info->ai_addr);

				int desireability = addr.desirability();

				const char * result = "skipped for low score";
				if(desireability > local_hostname_desireability) {
					result = "new winner";
					dprintf(D_HOSTNAME, "   I like it.\n");
					local_hostname_desireability = desireability;

					const char* dotpos = strchr(name, '.');
					if (dotpos) { // consider it as a FQDN
						local_fqdn = name;
						local_hostname = local_fqdn.Substr(0, dotpos-name-1);
					} else {
						local_hostname = name;
						local_fqdn = local_hostname;
						MyString default_domain;
						if (param(default_domain, "DEFAULT_DOMAIN_NAME")) {
							if (default_domain[0] != '.')
								local_fqdn += ".";
							local_fqdn += default_domain;
						}
					}
				}
				dprintf(D_HOSTNAME, "hostname: %s (score %d) %s\n", name, desireability, result);

#ifdef TEST_DNS_TODO
				// Resist urge to set local_ip*addr_initialized=true,
				// We want to repeatedly retest this looking for 
				// better results.
				if (!local_ipaddr_initialized) {
					replace_higher_scoring_addr("IP", 
						local_ipaddr, local_ipaddr_desireability, 
						addr, desireability);
				}

				if (addr.is_ipv4() && !local_ipv4addr_initialized) {
					replace_higher_scoring_addr("IPv4", 
						local_ipv4addr, local_ipv4addr_desireability, 
						addr, desireability);
				}

				if (addr.is_ipv6() && !local_ipv6addr_initialized) {
					replace_higher_scoring_addr("IPv6", 
						local_ipv6addr, local_ipv6addr_desireability, 
						addr, desireability);
				}
#else
	// Make Fedora quit complaining.
	if( local_ipv4addr_initialized && local_ipv6addr_initialized && local_fqdn_initialized ) {
		local_ipv4addr_initialized = false;
		local_ipv6addr_initialized = false;
		local_fqdn_initialized = false;
	}
#endif
			}
		}

	}

	return true;
}
Exemplo n.º 3
0
void
BaseShadow::terminateJob( update_style_t kind ) // has a default argument of US_NORMAL
{
	int reason;
	bool signaled;
	MyString str;

	if( ! jobAd ) {
		dprintf( D_ALWAYS, "In terminateJob() w/ NULL JobAd!" );
	}

	/* The first thing we do is record that we are in a termination pending
		state. */
	if (kind == US_NORMAL) {
		str.formatstr("%s = TRUE", ATTR_TERMINATION_PENDING);
		jobAd->Insert(str.Value());
	}

	if (kind == US_TERMINATE_PENDING) {
		// In this case, the job had already completed once and the
		// status had been saved to the job queue, however, for
		// some reason, the shadow didn't exit with a good value and
		// the job had been requeued. When this style of update
		// is used, it is a shortcut from the very birth of the shadow
		// to here, and so there will not be a remote resource or
		// anything like that set up. In this situation, we just
		// want to write the log event and mail the user and exit
		// with a good exit code so the schedd removes the job from
		// the queue. If for some reason the logging fails once again,
		// the process continues to repeat. 
		// This means at least once semantics for the termination event
		// and user email, but at no time should the job actually execute
		// again.

		int exited_by_signal = FALSE;
		int exit_signal = 0;
		int exit_code = 0;

		getJobAdExitedBySignal(jobAd, exited_by_signal);
		if (exited_by_signal == TRUE) {
			getJobAdExitSignal(jobAd, exit_signal);
		} else {
			getJobAdExitCode(jobAd, exit_code);
		}

		if (exited_by_signal == TRUE) {
			reason = JOB_COREDUMPED;
			str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, core_file_name);
			jobAd->Insert(str.Value());
		} else {
			reason = JOB_EXITED;
		}

		dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 		getCluster(), getProc(), 
	 		exited_by_signal? "killed by signal" : "exited with status",
	 		exited_by_signal ? exit_signal : exit_code );
		
			// write stuff to user log, but get values from jobad
		logTerminateEvent( reason, kind );

			// email the user, but get values from jobad
		emailTerminateEvent( reason, kind );

		DC_Exit( reason );
	}

	// the default path when kind == US_NORMAL

	// cleanup this shadow (kill starters, etc)
	cleanUp();

	reason = getExitReason();
	signaled = exitedBySignal();

	/* also store the corefilename into the jobad so we can recover this 
		during a termination pending scenario. */
	if( reason == JOB_COREDUMPED ) {
		str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, getCoreName());
		jobAd->Insert(str.Value());
	}

    // Update final Job committed time
    int last_ckpt_time = 0;
    jobAd->LookupInteger(ATTR_LAST_CKPT_TIME, last_ckpt_time);
    int current_start_time = 0;
    jobAd->LookupInteger(ATTR_JOB_CURRENT_START_DATE, current_start_time);
    int int_value = (last_ckpt_time > current_start_time) ?
                        last_ckpt_time : current_start_time;

    if( int_value > 0 ) {
        int job_committed_time = 0;
        jobAd->LookupInteger(ATTR_JOB_COMMITTED_TIME, job_committed_time);
		int delta = (int)time(NULL) - int_value;
        job_committed_time += delta;
        jobAd->Assign(ATTR_JOB_COMMITTED_TIME, job_committed_time);

		float slot_weight = 1;
		jobAd->LookupFloat(ATTR_JOB_MACHINE_ATTR_SLOT_WEIGHT0, slot_weight);
		float slot_time = 0;
		jobAd->LookupFloat(ATTR_COMMITTED_SLOT_TIME, slot_time);
		slot_time += slot_weight * delta;
		jobAd->Assign(ATTR_COMMITTED_SLOT_TIME, slot_time);
    }

	CommitSuspensionTime(jobAd);

	// update the job ad in the queue with some important final
	// attributes so we know what happened to the job when using
	// condor_history...
    if (m_num_cleanup_retries < 1 &&
        param_boolean("SHADOW_TEST_JOB_CLEANUP_RETRY", false)) {
		dprintf( D_ALWAYS,
				 "Testing Failure to perform final update to job queue!\n");
		retryJobCleanup();
		return;
    }
	if( !updateJobInQueue(U_TERMINATE) ) {
		dprintf( D_ALWAYS, 
				 "Failed to perform final update to job queue!\n");
		retryJobCleanup();
		return;
	}

	// Let's maximize the effectiveness of that queue synchronization and
	// only record the job as done if the update to the queue was successful.
	// If any of these next operations fail and the shadow exits with an
	// exit code which causes the job to get requeued, it will be in the
	// "terminate pending" state marked by the ATTR_TERMINATION_PENDING
	// attribute.

	dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 	getCluster(), getProc(), 
	 	signaled ? "killed by signal" : "exited with status",
	 	signaled ? exitSignal() : exitCode() );

	// write stuff to user log:
	logTerminateEvent( reason );

	// email the user
	emailTerminateEvent( reason );

	if( reason == JOB_EXITED && claimIsClosing() ) {
			// Startd not accepting any more jobs on this claim.
			// We do this here to avoid having to treat this case
			// identically to JOB_EXITED in the code leading up to
			// this point.
		dprintf(D_FULLDEBUG,"Startd is closing claim, so no more jobs can be run on it.\n");
		reason = JOB_EXITED_AND_CLAIM_CLOSING;
	}

	// try to get a new job for this shadow
	if( recycleShadow(reason) ) {
		// recycleShadow delete's this, so we must return immediately
		return;
	}

	// does not return.
	DC_Exit( reason );
}
Exemplo n.º 4
0
void DCMessenger::startCommand( classy_counted_ptr<DCMsg> msg )
{
	MyString error;
	msg->setMessenger( this );

	if( msg->deliveryStatus() == DCMsg::DELIVERY_CANCELED ) {
		msg->callMessageSendFailed( this );
		return;
	}

	time_t deadline = msg->getDeadline();
	if( deadline && deadline < time(NULL) ) {
		msg->addError(CEDAR_ERR_DEADLINE_EXPIRED,
					  "deadline for delivery of this message expired");
		msg->callMessageSendFailed( this );
		return;
	}

		// For a UDP message, we may need to register two sockets, one for
		// the SafeSock and another for a ReliSock to establish the
		// security session.
	Stream::stream_type st = msg->getStreamType();
	if( daemonCore->TooManyRegisteredSockets(-1,&error,st==Stream::safe_sock?2:1) ) {
			// Try again in a sec
			// Eventually, it would be better to queue this centrally
			// (i.e. in DaemonCore) rather than having an independent
			// timer for each case.  Then it would be possible to control
			// priority of different messages etc.
		dprintf(D_FULLDEBUG, "Delaying delivery of %s to %s, because %s\n",
				msg->name(),peerDescription(),error.Value());
		startCommandAfterDelay( 1, msg );
		return;
	}

		// Currently, there may be only one pending operation per messenger.
	ASSERT(!m_callback_msg.get());
	ASSERT(!m_callback_sock);
	ASSERT(m_pending_operation == NOTHING_PENDING);

	m_pending_operation = START_COMMAND_PENDING;
	m_callback_msg = msg;
	m_callback_sock = m_sock.get();
	if( !m_callback_sock ) {

		if (IsDebugLevel(D_COMMAND)) {
			const char * addr = m_daemon->addr();
			const int cmd = msg->m_cmd;
			dprintf (D_COMMAND, "DCMessenger::startCommand(%s,...) making non-blocking connection to %s\n", getCommandStringSafe(cmd), addr ? addr : "NULL");
		}

		const bool nonblocking = true;
		m_callback_sock = m_daemon->makeConnectedSocket(st,msg->getTimeout(),msg->getDeadline(),&msg->m_errstack,nonblocking);
		if( !m_callback_sock ) {
			msg->callMessageSendFailed( this );
			return;
		}
	}

	incRefCount();
	m_daemon->startCommand_nonblocking (
		msg->m_cmd,
		m_callback_sock,
		msg->getTimeout(),
		&msg->m_errstack,
		&DCMessenger::connectCallback,
		this,
		msg->name(),
		msg->getRawProtocol(),
		msg->getSecSessionId());
}
Exemplo n.º 5
0
void reset_local_hostname() {
	if( ! init_local_hostname_impl() ) {
		dprintf( D_ALWAYS, "Something went wrong identifying my hostname and IP address.\n" );
		hostname_initialized = false;
	} else {
		dprintf( D_HOSTNAME, "I am: hostname: %s, fully qualified doman name: %s, IP: %s, IPv4: %s, IPv6: %s\n", local_hostname.Value(), local_fqdn.Value(), local_ipaddr.to_ip_string().Value(), local_ipv4addr.to_ip_string().Value(), local_ipv6addr.to_ip_string().Value() );
		hostname_initialized = true;
	}
}
Exemplo n.º 6
0
//-----------------------------------------------------------------------------
int
Script::BackgroundRun( int reaperId, int dagStatus, int failedCount )
{
	TmpDir		tmpDir;
	MyString	errMsg;
	if ( !tmpDir.Cd2TmpDir( _node->GetDirectory(), errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to node directory %s: %s\n",
				_node->GetDirectory(), errMsg.Value() );

		return 0;
	}

	// Construct the command line, replacing some tokens with
    // information about the job.  All of these values would probably
    // be better inserted into the environment, rather than passed on
    // the command-line... some should be in the job's env as well...

    const char *delimiters = " \t";
    char * token;
	ArgList args;
    char * cmd = strnewp(_cmd);
    for (token = strtok (cmd,  delimiters) ; token != NULL ;
         token = strtok (NULL, delimiters)) {

		MyString arg;

		if ( !strcasecmp( token, "$JOB" ) ) {
			arg += _node->GetJobName();

		} else if ( !strcasecmp( token, "$RETRY" ) ) {
            arg += _node->GetRetries();

		} else if ( !strcasecmp( token, "$MAX_RETRIES" ) ) {
            arg += _node->GetRetryMax();

        } else if ( !strcasecmp( token, "$JOBID" ) ) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $JOBID macro should "
							"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
				arg += token;
			} else {
            	arg += _node->_CondorID._cluster;
            	arg += '.';
            	arg += _node->_CondorID._proc;
			}

        } else if (!strcasecmp(token, "$RETURN")) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $RETURN macro should "
							"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
			}
			arg += _retValJob;

		} else if (!strcasecmp( token, "$PRE_SCRIPT_RETURN" ) ) {
			if ( !_post ) {
				debug_printf( DEBUG_QUIET, "Warning: $PRE_SCRIPT_RETURN macro should "
						"not be used as a PRE script argument!\n" );
				check_warning_strictness( DAG_STRICT_1 );
			}
			arg += _retValScript;

		} else if (!strcasecmp(token, "$DAG_STATUS")) {
			arg += dagStatus;

		} else if (!strcasecmp(token, "$FAILED_COUNT")) {
			arg += failedCount;

		} else if (token[0] == '$') {
			// This should probably be a fatal error when -strict is
			// implemented.
			debug_printf( DEBUG_QUIET, "Warning: unrecognized macro %s "
						"in node %s %s script arguments\n", token,
						_node->GetJobName(), _post ? "POST" : "PRE" );
			check_warning_strictness( DAG_STRICT_1 );
			arg += token;
        } else {
			arg += token;
		}

		args.AppendArg(arg.Value());
    }

	_pid = daemonCore->Create_Process( cmd, args,
									   PRIV_UNKNOWN, reaperId, FALSE,
									   NULL, NULL, NULL, NULL, NULL, 0 );
    delete [] cmd;

	if ( !tmpDir.Cd2MainDir( errMsg ) ) {
		debug_printf( DEBUG_QUIET,
				"Could not change to original directory: %s\n",
				errMsg.Value() );
		return 0;
	}

	return _pid;
}
Exemplo n.º 7
0
bool condor_sockaddr::from_sinful(const MyString& sinful) {
	return from_sinful(sinful.Value());
}
Exemplo n.º 8
0
UserProc::UserProc( STARTUP_INFO &s ) :
	cluster( s.cluster ),
	proc( s.proc ),
	m_a_out( NULL ),
	core_name( NULL ),
	uid( s.uid ),
	gid( s.gid ),
	v_pid( s.virt_pid ),
	pid( 0 ),
	job_class( s.job_class ),
	state( NEW ),
	user_time( 0 ),
	sys_time( 0 ),
	exit_status_valid( FALSE ),
	exit_status( 0 ),
	ckpt_wanted( s.ckpt_wanted ),
	soft_kill_sig( s.soft_kill_sig ),
	new_ckpt_created( FALSE ),
	ckpt_transferred( FALSE ),
	core_created( FALSE ),
	core_transferred( FALSE ),
	exit_requested( FALSE ),
	image_size( -1 ),
	guaranteed_user_time( 0 ),
	guaranteed_sys_time( 0 ),
	pids_suspended( -1 )
{
	MyString	buf;
	mode_t 	omask;

	cmd = new char [ strlen(s.cmd) + 1 ];
	strcpy( cmd, s.cmd );

	// Since we are adding to the argument list, we may need to deal
	// with platform-specific arg syntax in the user's args in order
	// to successfully merge them with the additional args.
	args.SetArgV1SyntaxToCurrentPlatform();

	MyString args_errors;
	if(!args.AppendArgsV1or2Raw(s.args_v1or2,&args_errors)) {
		EXCEPT("ERROR: Failed to parse arguments string: %s\n%s",
			   args_errors.Value(),s.args_v1or2);
	}

		// set up environment as an object
	MyString env_errors;
	if(!env_obj.MergeFromV1or2Raw( s.env_v1or2,&env_errors )) {
		EXCEPT("ERROR: Failed to parse environment string: %s\n%s",
			   env_errors.Value(),s.env_v1or2);
	}

		// add name of SMP slot (from startd) into environment
	setSlotEnv(&env_obj);

	/* Port regulation for user job */
	int low_port, high_port;

		// assume outgoing port range
	if (get_port_range(TRUE, &low_port, &high_port) == TRUE) {
		buf.formatstr( "_condor_LOWPORT=%d", low_port);
		env_obj.SetEnv(buf.Value());
		buf.formatstr( "_condor_HIGHPORT=%d", high_port);
		env_obj.SetEnv(buf.Value());
	}
	/* end - Port regulation for user job */

	if( param_boolean("BIND_ALL_INTERFACES", true) ) {
		buf.formatstr( "_condor_BIND_ALL_INTERFACES=TRUE" );
	} else {
		buf.formatstr( "_condor_BIND_ALL_INTERFACES=FALSE" );
	}
	env_obj.SetEnv(buf.Value());
	

		// Generate a directory where process can run and do its checkpointing
	omask = umask(0);
	buf.formatstr( "dir_%d", getpid() );
	local_dir = new char [ buf.Length() + 1 ];
	strcpy( local_dir, buf.Value() );
	if (privsep_enabled()) {
		// the Switchboard expects a full path to privsep_create_dir
		MyString local_dir_path;
		local_dir_path.formatstr("%s/%s", Execute, local_dir);
		if (!privsep_create_dir(get_condor_uid(), local_dir_path.Value())) {
			EXCEPT("privsep_create_dir failure");
		}
		if (chmod(local_dir_path.Value(), LOCAL_DIR_MODE) == -1) {
			EXCEPT("chmod failure after privsep_create_dir");
		}
	}
	else {
		if( mkdir(local_dir,LOCAL_DIR_MODE) < 0 ) {
			EXCEPT( "mkdir(%s,0%o)", local_dir, LOCAL_DIR_MODE );
		}
	}
	(void)umask(omask);

		// Now that we know what the local_dir is, put the path into
		// the environment so the job knows where it is
	MyString scratch_env;
	scratch_env.formatstr("CONDOR_SCRATCH_DIR=%s/%s",Execute,local_dir);
	env_obj.SetEnv(scratch_env.Value());

	buf.formatstr( "%s/condor_exec.%d.%d", local_dir, cluster, proc );
	cur_ckpt = new char [ buf.Length() + 1 ];
	strcpy( cur_ckpt, buf.Value() );

		// Find out if user wants checkpointing
#if defined(NO_CKPT)
	ckpt_wanted = FALSE;
	dprintf(D_ALWAYS,
			"This platform doesn't implement checkpointing yet\n"
	);
#else
	ckpt_wanted = s.ckpt_wanted;
#endif

	restart = s.is_restart;
	coredump_limit_exists = s.coredump_limit_exists;
	coredump_limit = s.coredump_limit;
}
Exemplo n.º 9
0
void
UserProc::execute()
{
	ArgList new_args;
	char    **argv;
	char	**argp;
	char	**envp;
	sigset_t	sigmask;
	MyString	a_out_name;
	MyString	shortname;
	int		user_syscall_fd = -1;
	const	int READ_END = 0;
	const	int WRITE_END = 1;
	int		pipe_fds[2];
	FILE	*cmd_fp;
	char	buf[128];
	ReliSock	*new_reli = NULL;

	pipe_fds[0] = -1;
	pipe_fds[1] = -1;

	shortname.formatstr( "condor_exec.%d.%d", cluster, proc );
	a_out_name.formatstr( "%s/%s/%s", Execute, local_dir, shortname.Value() );

		// Set up arg vector according to class of job
	switch( job_class ) {

	  case CONDOR_UNIVERSE_STANDARD:
		if( pipe(pipe_fds) < 0 ) {
			EXCEPT( "pipe()" );}

			dprintf( D_ALWAYS, "Pipe built\n" );
		
			// The user process should not try to read commands from
			// 0, 1, or 2 since we'll be using the commands to redirect
			// those.
		if( pipe_fds[READ_END] < 14 ) {
			dup2( pipe_fds[READ_END], 14 );
			close( pipe_fds[READ_END] );
			pipe_fds[READ_END] = 14;
		}
		dprintf( D_ALWAYS, "New pipe_fds[%d,%d]\n", pipe_fds[0], pipe_fds[1] );
		sprintf( buf, "%d", pipe_fds[READ_END] );
		dprintf( D_ALWAYS, "cmd_fd = %s\n", buf );

		new_args.AppendArg(shortname);
		new_args.AppendArg("-_condor_cmd_fd");
		new_args.AppendArg(buf);
		break;

	  case CONDOR_UNIVERSE_PVM:
#if 1
		EXCEPT( "Don't know how to deal with PVM jobs" );
#else
		new_args.AppendArg(shortname);
		new_args.AppendArg("-1");
		new_args.AppendArg(in);
		new_args.AppendArg(out);
		new_args.AppendArg(err);
#endif
		break;

	  case CONDOR_UNIVERSE_VANILLA:
	  	if (privsep_enabled()) {
			EXCEPT("Don't know how to deal with Vanilla jobs");
		}
		new_args.AppendArg(shortname.Value());
		break;
	}

	new_args.AppendArgsFromArgList(args);

		// take care of USER_JOB_WRAPPER
	support_job_wrapper(a_out_name,&new_args);

	MyString exec_name;
	exec_name = a_out_name;

	// If privsep is turned on, then we need to use the PrivSep
	// Switchboard to launch the job
	//
	FILE* switchboard_in_fp;
	FILE* switchboard_err_fp;
	int switchboard_child_in_fd;
	int switchboard_child_err_fd;
	if (privsep_enabled()) {

		// create the pipes that we'll use to communicate
		//
		if (!privsep_create_pipes(switchboard_in_fp,
		                          switchboard_child_in_fd,
		                          switchboard_err_fp,
		                          switchboard_child_err_fd)) {
			EXCEPT("can't launch job: privsep_create_pipes failure");
		}
	}

	argv = new_args.GetStringArray();

		// Set an environment variable that tells the job where it may put scratch data
		// even if it moves to a different directory.

		// get the environment vector
	envp = env_obj.getStringArray();

		// We may run more than one of these, so each needs its own
		// remote system call connection to the shadow
	if( job_class == CONDOR_UNIVERSE_PVM ) {
		new_reli = NewConnection( v_pid );
		user_syscall_fd = new_reli->get_file_desc();
	}

		// print out arguments to execve
	dprintf( D_ALWAYS, "Calling execve( \"%s\"", exec_name.Value() );
	for( argp = argv; *argp; argp++ ) {							// argv
		dprintf( D_ALWAYS | D_NOHEADER, ", \"%s\"", *argp );
	}
	dprintf( D_ALWAYS | D_NOHEADER, ", 0" );
	for( argp = envp; *argp; argp++ ) {							// envp
		dprintf( D_ALWAYS | D_NOHEADER, ", \"%s\"", *argp );
	}
	dprintf( D_ALWAYS | D_NOHEADER, ", 0 )\n" );


	if( (pid = fork()) < 0 ) {
		EXCEPT( "fork" );
	}

	if( pid == 0 ) {	// the child

			// Block only these 3 signals which have special meaning for
			// checkpoint/restart purposes.  Leave other signals ublocked
			// so that if we get an exception during the restart process,
			// we will get a core file to debug.
		sigemptyset( &sigmask );
		// for some reason if we block these, the user process is unable
		// to unblock some or all of them.
#if 0
		sigaddset( &sigmask, SIGUSR1 );
		sigaddset( &sigmask, SIGUSR2 );
		sigaddset( &sigmask, SIGTSTP );
#endif
		sigprocmask( SIG_SETMASK, &sigmask, 0 );

			// renice
		renice_self( "JOB_RENICE_INCREMENT" );

			// make certain the syscall sockets which are being passed
			// to the user job are setup to be blocking sockets.  this
			// is done by calling timeout(0) CEDAR method.
			// we must do this because the syscall lib does _not_ 
			// expect to see any failures due to errno EAGAIN...
		if ( SyscallStream ) {
			SyscallStream->timeout(0);
		}
		if ( new_reli ) {
			new_reli->timeout(0);
		}

			// If I'm using privledge separation, connect to the procd.
			// we need to register a family with the procd for the newly
			// created process, so that the ProcD will allow us to send
			// signals to it
			//
		if (privsep_enabled() == true) {
			MyString procd_address;
			bool response;
			bool ret;
			ProcFamilyClient pfc;

			procd_address = get_procd_address();
			ret = pfc.initialize(procd_address.Value());
			if (ret == false) {
				EXCEPT("Failure to initialize the ProcFamilyClient object");
			}

			ret = pfc.register_subfamily(getpid(), getppid(), 60, response);

			if (ret == false) {
				EXCEPT("Could not communicate with procd. Aborting.");
			}

			if (response == false) {
				EXCEPT("Procd refused to register job subfamily. Aborting.");
			}
		}

			// If there is a requested coresize for this job, enforce it.
			// Do it before the set_priv_final to ensure root can alter 
			// the coresize to the requested amount. Otherwise, just
			// use whatever the current default is.
		if (coredump_limit_exists == TRUE) {
			limit( RLIMIT_CORE, coredump_limit, CONDOR_HARD_LIMIT, "max core size" );
		}

			// child process should have only it's submitting uid, and cannot
			// switch back to root or some other uid.  
			// It'd be nice to check for errors here, but
			// unfortunately, we can't, since this only returns the
			// previous priv state, not whether it worked or not. 
			//  -Derek Wright 4/30/98
		set_user_priv_final();

		switch( job_class ) {
		  
		  case CONDOR_UNIVERSE_STANDARD:
			// if we're using PrivSep, the chdir here could fail. instead,
			// we pass the job's IWD to the switchboard via pipe
			//
		  	if (!privsep_enabled()) {
				if( chdir(local_dir) < 0 ) {
					EXCEPT( "chdir(%s)", local_dir );
				}
			}
			close( pipe_fds[WRITE_END] );
			break;

		  case CONDOR_UNIVERSE_PVM:
			if( chdir(local_dir) < 0 ) {
				EXCEPT( "chdir(%s)", local_dir );
			}
			close( pipe_fds[WRITE_END] );
			dup2( user_syscall_fd, RSC_SOCK );
			break;

		  case CONDOR_UNIVERSE_VANILLA:
			set_iwd();
			open_std_file( 0 );
			open_std_file( 1 );
			open_std_file( 2 );

			(void)close( RSC_SOCK );
			(void)close( CLIENT_LOG );

			break;
		}

			// Make sure we're not root
		if( getuid() == 0 ) {
				// EXCEPT( "We're about to start as root, aborting." );
				// You can't see this error message at all.  So, just
				// exit(4), which is what EXCEPT normally gives. 
			exit( 4 );
		}

#if defined( LINUX ) && (defined(I386) || defined(X86_64))
		// adjust the execution domain of the child to be suitable for
		// checkpointing.
		patch_personality();
#endif 

			// if we're using privsep, we'll exec the PrivSep Switchboard
			// first, which is setuid; it will then setuid to the user we
			// give it and exec the real job
			//
		if (privsep_enabled()) {
			close(fileno(switchboard_in_fp));
			close(fileno(switchboard_err_fp));
			privsep_get_switchboard_command("exec",
			                                switchboard_child_in_fd,
			                                switchboard_child_err_fd,
			                                exec_name,
			                                new_args);
			deleteStringArray(argv);
			argv = new_args.GetStringArray();
		}

			// Everything's ready, start it up...
		errno = 0;
		execve( exec_name.Value(), argv, envp );

			// A successful call to execve() never returns, so it is an
			// error if we get here.  A number of errors are possible
			// but the most likely is that there is insufficient swap
			// space to start the new process.  We don't try to log
			// anything, since we have the UID/GID of the job's owner
			// and cannot write into the log files...
		exit( JOB_EXEC_FAILED );
	}

		// The parent

		// PrivSep - we have at this point only spawned the switchboard
		// with the "exec" command. we need to use our pipe to it in
		// order to tell it how to execute the user job, and then use
		// the error pipe to make sure everything worked
		//
	if (privsep_enabled()) {

		close(switchboard_child_in_fd);
		close(switchboard_child_err_fd);

		privsep_exec_set_uid(switchboard_in_fp, uid);
		privsep_exec_set_path(switchboard_in_fp, exec_name.Value());
		privsep_exec_set_args(switchboard_in_fp, new_args);
		privsep_exec_set_env(switchboard_in_fp, env_obj);
		privsep_exec_set_iwd(switchboard_in_fp, local_dir);
		privsep_exec_set_inherit_fd(switchboard_in_fp, pipe_fds[0]);
		privsep_exec_set_inherit_fd(switchboard_in_fp, RSC_SOCK);
		privsep_exec_set_inherit_fd(switchboard_in_fp, CLIENT_LOG);
		privsep_exec_set_is_std_univ(switchboard_in_fp);
		fclose(switchboard_in_fp);

		if (!privsep_get_switchboard_response(switchboard_err_fp)) {
			EXCEPT("error starting job: "
			           "privsep get_switchboard_response failure");
		}
	}

	dprintf( D_ALWAYS, "Started user job - PID = %d\n", pid );
	if( job_class != CONDOR_UNIVERSE_VANILLA ) {
			// Send the user process its startup environment conditions
		close( pipe_fds[READ_END] );
		cmd_fp = fdopen( pipe_fds[WRITE_END], "w" );
		dprintf( D_ALWAYS, "cmd_fp = %p\n", cmd_fp );

		if( is_restart() ) {
#if 1
			fprintf( cmd_fp, "restart\n" );
			dprintf( D_ALWAYS, "restart\n" );
#else
			fprintf( cmd_fp, "restart %s\n", target_ckpt );
			dprintf( D_ALWAYS, "restart %s\n", target_ckpt );
#endif
			fprintf( cmd_fp, "end\n" );
			dprintf( D_ALWAYS, "end\n" );
		} else {
			fprintf( cmd_fp, "end\n" );
			dprintf( D_ALWAYS, "end\n" );
		}
		fclose( cmd_fp );
	}

	deleteStringArray(argv);
	deleteStringArray(envp);
	state = EXECUTING;

	if( new_reli ) {
		delete new_reli;
	}

	// removed some vanilla-specific code here
	//
	ASSERT(job_class != CONDOR_UNIVERSE_VANILLA);
}
Exemplo n.º 10
0
static bool test_sprintf_MyString() {
    emit_test("Test sprintf overloading for MyString");

    int nchars = 0;
    int rchars = 0;
    MyString dst;
    MyString src;

    nchars = STL_STRING_UTILS_FIXBUF / 2;
    src="";
    for (int j = 0;  j < nchars;  ++j) src += char('0' + (j % 10));
    rchars = sprintf(dst, "%s", src.Value());
    if (dst != src) {
		FAIL;        
    }
    if (rchars != nchars) {
		FAIL;        
    }

    nchars = STL_STRING_UTILS_FIXBUF - 1;
    src="";
    for (int j = 0;  j < nchars;  ++j) src += char('0' + (j % 10));
    rchars = sprintf(dst, "%s", src.Value());
    if (dst != src) {
		FAIL;        
    }
    if (rchars != nchars) {
		FAIL;        
    }

    nchars = STL_STRING_UTILS_FIXBUF;
    src="";
    for (int j = 0;  j < nchars;  ++j) src += char('0' + (j % 10));
    rchars = sprintf(dst, "%s", src.Value());
    if (dst != src) {
		FAIL;        
    }
    if (rchars != nchars) {
		FAIL;        
    }

    nchars = STL_STRING_UTILS_FIXBUF + 1;
    src="";
    for (int j = 0;  j < nchars;  ++j) src += char('0' + (j % 10));
    rchars = sprintf(dst, "%s", src.Value());
    if (dst != src) {
		FAIL;        
    }
    if (rchars != nchars) {
		FAIL;        
    }

    nchars = STL_STRING_UTILS_FIXBUF * 2;
    src="";
    for (int j = 0;  j < nchars;  ++j) src += char('0' + (j % 10));
    rchars = sprintf(dst, "%s", src.Value());
    if (dst != src) {
		FAIL;        
    }
    if (rchars != nchars) {
		FAIL;        
    }

	PASS;
}
Exemplo n.º 11
0
/*
  Open a standard file (0, 1, or 2), given its fd number.
*/
void
open_std_file( int fd )
{
	char	*logical_name = NULL;
	char	*physical_name = NULL;
	char	*file_name;
	int	flags;
	int	success;
	int	real_fd;

	/* First, try the new set of remote lookups */

	success = REMOTE_CONDOR_get_std_file_info( fd, logical_name );
	if(success>=0) {
		success = REMOTE_CONDOR_get_file_info_new(logical_name, physical_name);
	}

	/* If either of those fail, fall back to the old way */

	if(success<0) {
		success = REMOTE_CONDOR_std_file_info( fd, logical_name, &real_fd );
		if(success<0) {
			EXCEPT("Couldn't get standard file info!");
		}
		physical_name = (char *)malloc(strlen(logical_name)+7);
		sprintf(physical_name,"local:%s",logical_name);
	}

	if(fd==0) {
		flags = O_RDONLY;
	} else {
		flags = O_CREAT|O_WRONLY|O_TRUNC;
	}

	/* The starter doesn't have the whole elaborate url mechanism. */
	/* So, just strip off the pathname from the end */

	file_name = strrchr(physical_name,':');
	if(file_name) {
		file_name++;
	} else {
		file_name = physical_name;
	}

	/* Check to see if appending is forced */

	if(strstr(physical_name,"append:")) {
		flags = flags | O_APPEND;
		flags = flags & ~O_TRUNC;
	}

	/* Now, really open the file. */

	real_fd = safe_open_wrapper_follow(file_name,flags,0);
	if(real_fd<0) {
		// Some things, like /dev/null, can't be truncated, so
		// try again w/o O_TRUNC. Jim, Todd and Derek 5/26/99
		flags = flags & ~O_TRUNC;
		real_fd = safe_open_wrapper_follow(file_name,flags,0);
	}

	if(real_fd<0) {
		MyString err;
		err.formatstr("Can't open \"%s\": %s", file_name,strerror(errno));
		dprintf(D_ALWAYS,"%s\n",err.Value());
		REMOTE_CONDOR_report_error(const_cast<char *>(err.Value()));
		exit( 4 );
	} else {
		if( real_fd != fd ) {
			dup2( real_fd, fd );
		}
	}
	free( logical_name );
	free( physical_name );
}
Exemplo n.º 12
0
MyString
which(const MyString &strFilename, const MyString &strAdditionalSearchDirs)
{
	MyString strPath = getenv( EnvGetName( ENV_PATH ) );
	dprintf( D_FULLDEBUG, "Path: %s\n", strPath.Value());

	char path_delim[3];
	sprintf( path_delim, "%c", PATH_DELIM_CHAR );
	StringList listDirectoriesInPath( strPath.Value(), path_delim );

#ifdef WIN32
	int iLength = strFilename.Length();
	if (!strcasecmp(strFilename.Substr(iLength - 4, iLength - 1).Value(), ".dll"))
	{	// if the filename ends in ".dll"
		
		// in order to mimic the behavior of LoadLibrary
		// we need to artificially insert some other stuff in the search path

		/* from MSDN LoadLibrary
			1.) The directory from which the application loaded. 
			2.) The current directory. 
				Windows XP: If HKLM\System\CurrentControlSet\Control\SessionManager\SafeDllSearchMode is 1, the current directory is the last directory searched. The default value is 0. 

			3.) The Windows system directory. Use the GetSystemDirectory function to get the path of this directory. 
				Windows NT/2000/XP: The wname of this directory is System32. 

			4.) Windows NT/2000/XP: The 16-bit Windows system directory. There is no function that obtains the path of this directory, but it is searched. The name of this directory is System. 
			5.) The Windows directory. Use the GetWindowsDirectory function to get the path of this directory. 
			6.) The directories that are listed in the PATH environment variable. 
		*/

		listDirectoriesInPath.rewind();
		listDirectoriesInPath.next();

		// #5
		char psNewDir[MAX_PATH];
		if (GetWindowsDirectory(psNewDir, MAX_PATH) > 0)
			listDirectoriesInPath.insert(psNewDir);
		else
			dprintf( D_FULLDEBUG, "GetWindowsDirectory() failed, err=%d\n", GetLastError());

		listDirectoriesInPath.rewind();
		listDirectoriesInPath.next();

		// #4 
		strcat(psNewDir, "\\System");
		listDirectoriesInPath.insert(psNewDir);

		listDirectoriesInPath.rewind();
		listDirectoriesInPath.next();

		// #3
		if (GetSystemDirectory(psNewDir, MAX_PATH) > 0)
			listDirectoriesInPath.insert(psNewDir);
		else
			dprintf( D_FULLDEBUG, "GetSystemDirectory() failed, err=%d\n", GetLastError());

		listDirectoriesInPath.rewind();
		listDirectoriesInPath.next();

		// #2
		if (_getcwd(psNewDir, MAX_PATH))
			listDirectoriesInPath.insert(psNewDir);
		else
			dprintf( D_FULLDEBUG, "_getcwd() failed, err=%d\n", errno);

		// #1  had better be covered by the user passing in strAdditionalSearchDirs
	}
#endif

	listDirectoriesInPath.rewind();
	listDirectoriesInPath.next();

	// add additional dirs if specified
	if( strAdditionalSearchDirs != "" ) {
		// path_delim was set above
		StringList listAdditionalSearchDirs( strAdditionalSearchDirs.Value(), path_delim );
		listDirectoriesInPath.create_union(listAdditionalSearchDirs, false);
	}
	
	listDirectoriesInPath.rewind();

	const char *psDir;
	while( (psDir = listDirectoriesInPath.next()) )
	{
		dprintf( D_FULLDEBUG, "Checking dir: %s\n", psDir );

		char *psFullDir = dircat(psDir, strFilename.Value());
		MyString strFullDir = psFullDir;
		delete [] psFullDir;

		StatInfo info(strFullDir.Value());
		if( info.Error() == SIGood ) {
			return strFullDir;
		}
	}
	return "";
}
Exemplo n.º 13
0
const char* my_ip_string() {
    static MyString __my_ip_string;
    __my_ip_string = get_local_ipaddr().to_ip_string();
    return __my_ip_string.Value();
}
Exemplo n.º 14
0
// Return our full hostname (with domain) in a static data buffer.
const char* my_full_hostname() {
    static MyString __my_full_hostname;
    __my_full_hostname = get_local_fqdn();
    return __my_full_hostname.Value();
}
Exemplo n.º 15
0
ClassAd*
readJobAd( void )
{
    ClassAd* ad = NULL;
    bool is_stdin = false;
    bool read_something = false;

    ASSERT( job_ad_file );

    if( job_ad_file[0] == '-' && job_ad_file[1] == '\0' ) {
        fp = stdin;
        is_stdin = true;
    } else {
        if (fp == NULL) {
            fp = safe_fopen_wrapper_follow( job_ad_file, "r" );
            if( ! fp ) {
                EXCEPT( "Failed to open ClassAd file (%s): %s (errno %d)",
                        job_ad_file, strerror(errno), errno );
            }
        }
    }

    dprintf( D_FULLDEBUG, "Reading job ClassAd from %s\n",
             is_stdin ? "STDIN" : job_ad_file );

    ad = new ClassAd;
    MyString line;
    while( line.readLine(fp) ) {
        read_something = true;
        line.chomp();
        if( line[0] == '#' ) {
            dprintf( D_JOB, "IGNORING COMMENT: %s\n", line.Value() );
            continue;
        }
        if( line == "***" ) {
            dprintf( D_JOB, "Saw ClassAd delimitor, stopping\n" );
            break;
        }
        if( ! ad->Insert(line.Value()) ) {
            EXCEPT( "Failed to insert \"%s\" into ClassAd!", line.Value() );
        }
    }
    if( ! read_something ) {
        EXCEPT( "reading ClassAd from (%s): file is empty",
                is_stdin ? "STDIN" : job_ad_file );
    }
    if( IsDebugVerbose(D_JOB) ) {
        ad->dPrint( D_JOB );
    }

    // For debugging, see if there's a special attribute in the
    // job ad that sends us into an infinite loop, waiting for
    // someone to attach with a debugger
    int shadow_should_wait = 0;
    ad->LookupInteger( ATTR_SHADOW_WAIT_FOR_DEBUG,
                       shadow_should_wait );
    if( shadow_should_wait ) {
        dprintf( D_ALWAYS, "Job requested shadow should wait for "
                 "debugger with %s=%d, going into infinite loop\n",
                 ATTR_SHADOW_WAIT_FOR_DEBUG, shadow_should_wait );
        while( shadow_should_wait ) { }
    }

    return ad;
}
Exemplo n.º 16
0
void
UserProc::handle_termination( int exit_st )
{
	/* XXX I wonder what the kernel does if a core file is created such that 
		the entire path up to the core file is path max, and when you add the
		pid, it overshoots the path max... */
	MyString corebuf;
	struct stat sbuf;
	int core_name_type = CORE_NAME_UNKNOWN;
	pid_t user_job_pid;

	exit_status = exit_st;
	exit_status_valid = TRUE;
	accumulate_cpu_time();

	if( exit_requested && job_class == CONDOR_UNIVERSE_VANILLA ) { // job exited by request
		dprintf( D_FAILURE|D_ALWAYS, "Process exited by request\n" );
		state = NON_RUNNABLE;
	} else if( WIFEXITED(exit_status) ) { 
                                     // exited on own accord with some status
		dprintf( D_FAILURE|D_ALWAYS,
			"Process %d exited with status %d\n", pid, WEXITSTATUS(exit_status)
		);
		if( WEXITSTATUS(exit_status) == JOB_EXEC_FAILED ) {
			dprintf( D_FAILURE|D_ALWAYS,
				"EXEC of user process failed, probably insufficient swap\n"
			);
			state = NON_RUNNABLE;
		} else {
			state = NORMAL_EXIT;
			commit_cpu_time();
		}
	} else {
		dprintf( D_ALWAYS,
			"Process %d killed by signal %d\n", pid, WTERMSIG(exit_status)
		);
		switch( WTERMSIG(exit_status) ) {
		  case SIGUSR2:			// ckpt and vacate exit 
			dprintf( D_FAILURE|D_ALWAYS, "Process exited for checkpoint\n" );
			state = CHECKPOINTING;
			commit_cpu_time();
			break;
		  case SIGQUIT:			// exited for a checkpoint
			dprintf( D_FAILURE|D_ALWAYS, "Process exited for checkpoint\n" );
			state = CHECKPOINTING;
			/*
			  For bytestream checkpointing:  the only way the process exits
			  with signal SIGQUIT is if has transferred a checkpoint
			  successfully.
			*/
			ckpt_transferred = TRUE;
			break;
		  case SIGUSR1:
		  case SIGKILL:				// exited by request - no ckpt
			dprintf( D_FAILURE|D_ALWAYS, "Process exited by request\n" );
			state = NON_RUNNABLE;
			break;
		  default:					// exited abnormally due to signal
			dprintf( D_FAILURE|D_ALWAYS, "Process exited abnormally\n" );
			state = ABNORMAL_EXIT;
			commit_cpu_time();		// should this be here on ABNORMAL_EXIT ???? -Todd
		}
			
	}
	user_job_pid = pid;
	pid = 0;

	// removed some vanilla-specific code here
	//
	ASSERT(job_class != CONDOR_UNIVERSE_VANILLA);

	priv_state priv;

	switch( state ) {

	    case CHECKPOINTING:
			core_created = FALSE;
			ckpt_transferred = TRUE;
			break;

        case ABNORMAL_EXIT:
			priv = set_root_priv();	// need to be root to access core file

			/* figure out the name of the core file, it could be either
				"core.pid" which we will try first, or "core". We'll take
				the first one we find, and ignore the other one. */

			/* the default */
			core_name_type = CORE_NAME_UNKNOWN;

			/* try the 'core.pid' form first */
			corebuf.formatstr( "%s/core.%lu", local_dir, 
				(unsigned long)user_job_pid);
			if (stat(corebuf.Value(), &sbuf) >= 0) {
				core_name_type = CORE_NAME_KNOWN;
			} else {
				/* now try the normal 'core' form */
				corebuf.formatstr( "%s/core", local_dir);
				if (stat(corebuf.Value(), &sbuf) >= 0) {
					core_name_type = CORE_NAME_KNOWN;
				}
			}

			/* If I know a core file name, then set up its retrival */
			core_created = FALSE; // assume false if unknown
			if (core_name_type == CORE_NAME_KNOWN) {

				/* AFAICT, This is where core_name should be set up,
					if it had already been setup somehow, then that is a logic
					error and I'd need to find out why/where */
				if (core_name != NULL) {
					EXCEPT( "core_name should have been NULL!" );
				}

				/* this gets deleted when this objects destructs */
				core_name = new char [ corebuf.Length() + 1 ];
				if(core_name != NULL) strcpy( core_name,corebuf.Value() );

				/* core_is_valid checks to make sure it isn't a symlink and
					returns false if it is */
					if( core_is_valid(corebuf.Value()) == TRUE) {
					dprintf( D_FAILURE|D_ALWAYS, 
						"A core file was created: %s\n", corebuf.Value());
					core_created = TRUE;
				} else {
					dprintf( D_FULLDEBUG, "No core file was created\n" );
					core_created = FALSE;
					if (core_name != NULL) {
						// remove any incomplete core, or possible symlink
						(void)unlink(corebuf.Value());	
					}
				}
			}

			set_priv(priv);

			break;

		default:
			dprintf( D_FULLDEBUG, "No core file was created\n" );
			core_created = FALSE;
			break;
	}

}
Exemplo n.º 17
0
void
init_daemon_list()
{
	char	*daemon_name;
	StringList daemon_names, dc_daemon_names;

	daemons.ordered_daemon_names.clearAll();
	char* dc_daemon_list = param("DC_DAEMON_LIST");

	if( !dc_daemon_list ) {
		dc_daemon_names.initializeFromString(default_dc_daemon_list);
	}
	else {
		if ( *dc_daemon_list == '+' ) {
			MyString	dclist;
			dclist = default_dc_daemon_list;
			dclist += ", ";
			dclist += &dc_daemon_list[1];
			dc_daemon_names.initializeFromString( dclist.Value() );
		}
		else {
			dc_daemon_names.initializeFromString(dc_daemon_list);

			StringList default_list(default_dc_daemon_list);
			default_list.rewind();
			char *default_entry;
			int	missing = 0;
			while( (default_entry=default_list.next()) ) {
				if( !dc_daemon_names.contains_anycase(default_entry) ) {
					dprintf(D_ALWAYS,
							"WARNING: expected to find %s in"
							" DC_DAEMON_LIST, but it is not there.\n",
							default_entry );
					missing++;
				}
			}
			if ( missing ) {
				dprintf( D_ALWAYS,
						 "WARNING: "
						 "%d entries are missing from DC_DAEMON_LIST.  "
						 "Unless you know what you are doing, it "
						 "is best to leave DC_DAEMON_LIST undefined "
						 "so that the default settings are used, "
						 "or use the new 'DC_DAEMON_LIST = "
						 "+<list>' syntax.\n", missing );
			}
				
		}
		free(dc_daemon_list);
	}

		// Tolerate a trailing comma in the list
	dc_daemon_names.remove( "" );

	char* ha_list = param("MASTER_HA_LIST");
	if( ha_list ) {
			// Make MASTER_HA_LIST case insensitive by always converting
			// what we get to uppercase.
		StringList ha_names;
		ha_list = strupr( ha_list );
		ha_names.initializeFromString(ha_list);
			// Tolerate a trailing comma in the list
		ha_names.remove( "" );
		daemons.ordered_daemon_names.create_union( ha_names, false );

		ha_names.rewind();
		while( (daemon_name = ha_names.next()) ) {
			if(daemons.FindDaemon(daemon_name) == NULL) {
				if( dc_daemon_names.contains(daemon_name) ) {
					new class daemon(daemon_name, true, true );
				} else {
					new class daemon(daemon_name, false, true );
				}
			}
		}
	}

	char* daemon_list = param("DAEMON_LIST");
	if( daemon_list ) {
			// Make DAEMON_LIST case insensitive by always converting
			// what we get to uppercase.
		daemon_list = strupr( daemon_list );
		daemon_names.initializeFromString(daemon_list);
		free( daemon_list );
			// Tolerate a trailing comma in the list
		daemon_names.remove( "" );


			/*
			  Make sure that if COLLECTOR is in the list, put it at
			  the front...  unfortunately, our List template (what
			  StringList is defined in terms of) is amazingly broken
			  with regard to insert() and append().  :( insert()
			  usually means: insert *before* the current position.
			  however, if you're at the end, it inserts before the
			  last valid entry, instead of working like append() as
			  you'd expect.  OR, if you just called rewind() and
			  insert() (to insert at the begining, right?) it works
			  like append() and sticks it at the end!!  ARGH!!!  so,
			  we've got to call next() after rewind() so we really
			  insert it at the front of the list.  UGH!  EVIL!!!
			  Derek Wright <*****@*****.**> 2004-12-23
			*/
		if( daemon_names.contains("COLLECTOR") ) {
			daemon_names.deleteCurrent();
			daemon_names.rewind();
			daemon_names.next();
			daemon_names.insert( "COLLECTOR" );
		}

			// start shared_port first for a cleaner startup
		if( daemon_names.contains("SHARED_PORT") ) {
			daemon_names.deleteCurrent();
			daemon_names.rewind();
			daemon_names.next();
			daemon_names.insert( "SHARED_PORT" );
		}
		else if( SharedPortEndpoint::UseSharedPort() ) {
			if( param_boolean("AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST",true) ) {
				dprintf(D_ALWAYS,"Adding SHARED_PORT to DAEMON_LIST, because USE_SHARED_PORT=true (to disable this, set AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST=False)\n");
				daemon_names.rewind();
				daemon_names.next();
				daemon_names.insert( "SHARED_PORT" );
			}
		}
		daemons.ordered_daemon_names.create_union( daemon_names, false );

		daemon_names.rewind();
		while( (daemon_name = daemon_names.next()) ) {
			if(daemons.FindDaemon(daemon_name) == NULL) {
				if( dc_daemon_names.contains(daemon_name) ) {
					new class daemon(daemon_name);
				} else {
					new class daemon(daemon_name, false);
				}
			}
		}
	} else {
		daemons.ordered_daemon_names.create_union( dc_daemon_names, false );
		for(int i = 0; default_daemon_list[i]; i++) {
			new class daemon(default_daemon_list[i]);
		}
	}

}
Exemplo n.º 18
0
void* DaemonCore::Stats::NewProbe(const char * category, const char * name, int as)
{
   if ( ! this->enabled) return NULL;

   MyString attr;
   attr.formatstr("DC%s_%s", category, name);
   cleanStringForUseAsAttr(attr);

   void * ret = NULL;
   switch (as & (AS_TYPE_MASK | IS_CLASS_MASK))
      {
      case AS_COUNT | IS_RECENT:
         {
         stats_entry_recent<int> * probe =
         Pool.NewProbe< stats_entry_recent<int> >(name,  attr.Value(), as);
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

     case  AS_COUNT | IS_CLS_EMA:
         {
         stats_entry_ema<int>* probe =
         Pool.NewProbe< stats_entry_ema<int> >(name, attr.Value(), as | stats_entry_ema<int>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  STATS_ENTRY_TYPE_DOUBLE | IS_CLS_EMA:
         {
         stats_entry_ema<double>* probe =
         Pool.NewProbe< stats_entry_ema<double> >(name, attr.Value(), as | stats_entry_ema<double>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  AS_COUNT | IS_CLS_SUM_EMA_RATE:
         {
         stats_entry_sum_ema_rate<int>* probe =
         Pool.NewProbe< stats_entry_sum_ema_rate<int> >(name, attr.Value(), as | stats_entry_sum_ema_rate<int>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

     case  STATS_ENTRY_TYPE_DOUBLE | IS_CLS_SUM_EMA_RATE:
         {
         stats_entry_sum_ema_rate<double>* probe =
         Pool.NewProbe< stats_entry_sum_ema_rate<double> >(name, attr.Value(), as | stats_entry_sum_ema_rate<double>::PubDefault);
         probe->ConfigureEMAHorizons(ema_config);
         probe->Clear();
         ret = probe;
         }
         break;

      case AS_ABSTIME | IS_RECENT:
      case AS_RELTIME | IS_RECENT:
         {
         stats_entry_recent<time_t> * probe =
         Pool.NewProbe< stats_entry_recent<time_t> >(name,  attr.Value(), as);
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

      case AS_COUNT | IS_RCT:
#ifdef USE_MIRON_PROBE_FOR_DC_RUNTIME_STATS
         {
         as &= ~(IS_CLASS_MASK);  // strip off IS_RTC class
         as |= IS_CLS_PROBE | IF_RT_SUM; // and set IS_CLS_PROBE & IF_RT_SUM classes
         stats_entry_probe<double> * probe =
         Pool.NewProbe< stats_entry_probe<double> >(name, attr.Value(), as);
         ret = probe;
         }
         break;
#else
          // fall through
#endif
      case AS_RELTIME | IS_RCT:
         {
         stats_recent_counter_timer * probe =
         Pool.NewProbe<stats_recent_counter_timer>(name, attr.Value(), as);
        #if 0 // def DEBUG
         attr += "Debug";
         Pool.AddPublish(attr.Value(), probe, NULL, 0, 
                       (FN_STATS_ENTRY_PUBLISH)&stats_recent_counter_timer::PublishDebug);
        #endif
         probe->SetRecentMax(this->RecentWindowMax / this->RecentWindowQuantum);
         ret = probe;
         }
         break;

      default:
         EXCEPT("unsupported probe type");
         break;
      }

   return ret;
}
Exemplo n.º 19
0
int
main(int argc, char* argv[])
{
  Collectors = NULL;

  HistorySnapshot *historySnapshot;
  SQLQuery queryhor;
  SQLQuery queryver;
  QuillErrCode st;

  void **parameters;
  char *dbconn=NULL;
  bool readfromfile = false,remotequill=false;

  char *dbIpAddr=NULL, *dbName=NULL,*queryPassword=NULL,*quillName=NULL;

  AttrList *ad=0;

  int flag = 1;

  MyString tmp;

  int i;
  parameters = (void **) malloc(NUM_PARAMETERS * sizeof(void *));
  myDistro->Init( argc, argv );

  queryhor.setQuery(HISTORY_ALL_HOR, NULL);
  queryver.setQuery(HISTORY_ALL_VER, NULL);

  longformat=TRUE;   
  for(i=1; i<argc; i++) {
    if(strcmp(argv[i], "-name")==0) {
		i++;
		if (argc <= i) {
			fprintf( stderr,
					 "Error: Argument -name requires the name of a quilld as a parameter\n" );
			exit(1);
		}
		
		if( !(quillName = get_daemon_name(argv[i])) ) {
			fprintf( stderr, "Error: unknown host %s\n",
					 get_host_part(argv[i]) );
			printf("\n");
			print_wrapped_text("Extra Info: The name given with the -name "
							   "should be the name of a condor_quilld process. "
							   "Normally it is either a hostname, or "
							   "\"name@hostname\". "
							   "In either case, the hostname should be the "
							   "Internet host name, but it appears that it "
							   "wasn't.",
							   stderr);
			exit(1);
		}
		tmp.sprintf ("%s == \"%s\"", ATTR_NAME, quillName);      		
		quillQuery.addORConstraint (tmp.Value());

                tmp.sprintf ("%s == \"%s\"", ATTR_SCHEDD_NAME, quillName);
                quillQuery.addORConstraint (tmp.Value());

		remotequill = true;
		readfromfile = false;
    }
    else if (strcmp(argv[i],"-help")==0) {
		Usage(argv[0],0);
    }
  }
  if (i<argc) Usage(argv[0]);
  
  config();
  
	/* This call must happen AFTER config() is called */
  if (checkDBconfig() == true && !readfromfile) {
  	readfromfile = false;
  } else {
		  /* couldn't get DB configuration, so bail out */
    printf("Error: Cannot use DB to get history information\n");
  	exit(1);
  }

  if(readfromfile == false) {
	  if(remotequill) {
		  if (Collectors == NULL) {
			  Collectors = CollectorList::create();
			  if(Collectors == NULL ) {
				  printf("Error: Unable to get list of known collectors\n");
				  exit(1);
			  }
		  }
		  result = Collectors->query ( quillQuery, quillList );
		  if(result != Q_OK) {
			  printf("Fatal Error querying collectors\n");
			  exit(1);
		  }

		  if(quillList.MyLength() == 0) {
			  printf("Error: Unknown quill server %s\n", quillName);
			  exit(1);
		  }
		  
		  quillList.Open();
		  while ((ad = quillList.Next())) {
				  // get the address of the database
			  dbIpAddr = dbName = queryPassword = NULL;
			  if (!ad->LookupString(ATTR_QUILL_DB_IP_ADDR, &dbIpAddr) ||
				  !ad->LookupString(ATTR_QUILL_DB_NAME, &dbName) ||
				  !ad->LookupString(ATTR_QUILL_DB_QUERY_PASSWORD, &queryPassword) || 
				  (ad->LookupBool(ATTR_QUILL_IS_REMOTELY_QUERYABLE,flag) && !flag)) {
				  printf("Error: The quill daemon \"%s\" is not set up "
						 "for database queries\n", 
						 quillName);
				  exit(1);
			  }
		  }
	  }
	  dbconn = getDBConnStr(quillName,dbIpAddr,dbName,queryPassword);
	  historySnapshot = new HistorySnapshot(dbconn);
		  //printf ("\n\n-- Quill: %s : %s : %s\n", quillName, dbIpAddr, dbName);
	  
	  st = historySnapshot->sendQuery(&queryhor, &queryver, longformat, true);
		  //if there's a failure here and if we're not posing a query on a 
		  //remote quill daemon, we should instead query the local file
	  if(st == QUILL_FAILURE) {
		  printf( "-- Database at %s not reachable\n", dbIpAddr);
	  }
		  // query history table
	  if (historySnapshot->isHistoryEmpty()) {
		  printf("No historical jobs in the database\n");
	  }
	  historySnapshot->release();
	  delete(historySnapshot);
  }
  
  
  if(parameters) free(parameters);
  if(dbIpAddr) free(dbIpAddr);
  if(dbName) free(dbName);
  if(queryPassword) free(queryPassword);
  if(quillName) free(quillName);
  if(dbconn) free(dbconn);
  return 0;
}
Exemplo n.º 20
0
// This handler is called when a client wishes to write files from the
// transferd's storage.
int
TransferD::write_files_handler(int cmd, Stream *sock) 
{
	ReliSock *rsock = (ReliSock*)sock;
	MyString capability;
	int protocol = FTP_UNKNOWN;
	TransferRequest *treq = NULL;
	MyString fquser;
	static int transfer_reaper_id = -1;
	ThreadArg *thread_arg;
	int tid;
	ClassAd reqad;
	ClassAd respad;

	cmd = cmd; // quiet the compiler.

	dprintf(D_ALWAYS, "Got TRANSFERD_WRITE_FILES!\n");

	/////////////////////////////////////////////////////////////////////////
	// make sure we are authenticated
	/////////////////////////////////////////////////////////////////////////
	if( ! rsock->triedAuthentication() ) {
		CondorError errstack;
		if( ! SecMan::authenticate_sock(rsock, WRITE, &errstack) ) {
			// we failed to authenticate, we should bail out now
			// since we don't know what user is trying to perform
			// this action.
			// TODO: it'd be nice to print out what failed, but we
			// need better error propagation for that...
			errstack.push( "TransferD::setup_transfer_request_handler()", 42,
				"Failure to register transferd - Authentication failed" );
			dprintf( D_ALWAYS, "setup_transfer_request_handler() "
				"aborting: %s\n",
				errstack.getFullText() );
			refuse( rsock );
			return CLOSE_STREAM;
		} 
	}

	fquser = rsock->getFullyQualifiedUser();


	/////////////////////////////////////////////////////////////////////////
	// Check to see if the capability the client tells us is something that
	// we have knowledge of. We ONLY check the capability and not the
	// identity of the person in question. This allows people of different
	// identities to write files here as long as they had the right 
	// capability. While this might not sound secure, they STILL had to have
	// authenticated as someone this daemon trusts. 
	// Similarly, check the protocol it wants to use as well as ensure that
	// the direction the transfer request was supposed to be is being honored.
	/////////////////////////////////////////////////////////////////////////
	rsock->decode();

	// soak the request ad from the client about what it wants to transfer
	reqad.initFromStream(*rsock);
	rsock->end_of_message();

	reqad.LookupString(ATTR_TREQ_CAPABILITY, capability);

	rsock->encode();

	// do I know of such a capability?
	if (m_treqs.lookup(capability, treq) != 0) {
		// didn't find it. Log it and tell them to leave and close up shop
		respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
		respad.Assign(ATTR_TREQ_INVALID_REASON, "Invalid capability!");
		respad.put(*rsock);
		rsock->end_of_message();

		dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
			"using capability '%s', but there was no such capability. "
			"Access denied.\n", fquser.Value(), capability.Value());
		return CLOSE_STREAM;
	}

	reqad.LookupInteger(ATTR_TREQ_FTP, protocol);

	// am I willing to use this protocol?
	switch(protocol) {
		case FTP_CFTP: // FileTrans protocol, I'm happy.
			break;

		default:
			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"Invalid file transfer protocol!");
			respad.put(*rsock);
			rsock->end_of_message();

			dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
				"using protocol '%d', but I don't support that protocol. "
				"Access denied.\n", fquser.Value(), protocol);
			return CLOSE_STREAM;
	}

	// nsure that this transfer request was of the uploading variety
	if (treq->get_direction() != FTPD_UPLOAD) {
			respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE);
			respad.Assign(ATTR_TREQ_INVALID_REASON, 
				"Transfer Request was not an uploading request!");
			respad.put(*rsock);
			rsock->end_of_message();

			dprintf(D_ALWAYS, "Client identity '%s' tried to write some files "
				"to a transfer request that wasn't expecting to be written. "
				"Access denied.\n", fquser.Value());
	}

	/////////////////////////////////////////////////////////////////////////
	// Tell the client everything was ok.
	/////////////////////////////////////////////////////////////////////////

	respad.Assign(ATTR_TREQ_INVALID_REQUEST, FALSE);
	respad.put(*rsock);
	rsock->end_of_message();

	/////////////////////////////////////////////////////////////////////////
	// Set up a thread (a process under unix) to read ALL of the job files
	// for all of the ads in the TransferRequest.
	/////////////////////////////////////////////////////////////////////////

	// now create a thread, passing in the sock, which uses the file transfer
	// object to accept the files.

	if (transfer_reaper_id == -1) {
		// only set this up ONCE so each and every thread gets one.
		transfer_reaper_id = daemonCore->Register_Reaper(
						"write_files_reaper",
						(ReaperHandlercpp) &TransferD::write_files_reaper,
						"write_files_reaper",
						this
						);
	}

	thread_arg = new ThreadArg(protocol, treq);

	// Start a new thread (process on Unix) to do the work
	tid = daemonCore->Create_Thread(
		(ThreadStartFunc)&TransferD::write_files_thread,
		(void *)thread_arg,
		rsock,
		transfer_reaper_id
		);
	
	if (tid == FALSE) {
		// XXX How do I handle this failure?
	}


	// associate the tid with the request so I can deal with it propery in
	// the reaper
	m_client_to_transferd_threads.insert(tid, treq);

	// The stream is inherited to the thread, who does the transfer and
	// finishes the protocol, but in the parent, I'm closing it.
	return CLOSE_STREAM;
}
Exemplo n.º 21
0
bool condor_sockaddr::from_ip_string(const MyString& ip_string)
{
	return from_ip_string(ip_string.Value());
}
Exemplo n.º 22
0
int
ParallelProc::addEnvVars() 
{
   dprintf ( D_FULLDEBUG, "ParallelProc::addEnvVars()\n" );

	   // Pull the environment out of the job ad...
	Env env;
	MyString env_errors;
	if ( !env.MergeFrom(JobAd,&env_errors) ) {
		dprintf( D_ALWAYS, "Failed to read environment from JobAd: %s\n", 
				 env_errors.Value() );
		return 0;
	}

		// Add the remote spool dir, the "server" directory for
		// condor_chirp to stage files to/from
    MyString spool;
	if ( JobAd->LookupString( ATTR_REMOTE_SPOOL_DIR, spool ) < 1 ) {
		dprintf( D_ALWAYS, "%s not found in JobAd.  Aborting.\n", 
				 ATTR_REMOTE_SPOOL_DIR);
		return 0;
	}

    env.SetEnv( "_CONDOR_REMOTE_SPOOL_DIR", spool.Value() );

		// Add this node's number to CONDOR_PROCNO
	char buf[128];
	sprintf(buf, "%d", Node);
	env.SetEnv("_CONDOR_PROCNO", buf);


		// And put the total number of nodes into CONDOR_NPROC
	int machine_count;
	if ( JobAd->LookupInteger( ATTR_CURRENT_HOSTS, machine_count ) !=  1 ) {
		dprintf( D_ALWAYS, "%s not found in JobAd.  Aborting.\n", 
				 ATTR_CURRENT_HOSTS);
		return 0;
	}

	sprintf(buf, "%d", machine_count);
	env.SetEnv("_CONDOR_NPROCS", buf);

		// Now stick the condor bin directory in front of the path,
		// so user scripts can call condor_config_val
    char *bin = param( "BIN" );
    if ( !bin ) {
        dprintf ( D_ALWAYS, "Can't find BIN "
                  "in config file! Aborting!\n" ); 
        return 0;
    }

	MyString path;
	MyString new_path;
	char *tmp;

	new_path = bin;
	new_path += ":";

	if(env.GetEnv("PATH",path)) {
        // The user gave us a path in env.  Find & alter:
        dprintf ( D_FULLDEBUG, "$PATH in ad:%s\n", path.Value() );

		new_path += path;
	}
	else {
        // User did not specify any env, or there is no 'PATH'
        // in env sent along.  We get $PATH and alter it.

        tmp = getenv( "PATH" );
        if ( tmp ) {
            dprintf ( D_FULLDEBUG, "No Path in ad, $PATH in env\n" );
            dprintf ( D_FULLDEBUG, "before: %s\n", tmp );
			new_path += tmp;
        }
        else {   // no PATH in env.  Make one.
            dprintf ( D_FULLDEBUG, "No Path in ad, no $PATH in env\n" );
			new_path = bin;
        }
    }
	free(bin);
	env.SetEnv("PATH",new_path.Value());

	char *condor_config = getenv( "CONDOR_CONFIG");
	if (condor_config) {
		env.SetEnv("CONDOR_CONFIG", condor_config);
	}
	
	if(IsFulldebug(D_FULLDEBUG)) {
		MyString env_str;
		env.getDelimitedStringForDisplay(&env_str);
		dprintf ( D_FULLDEBUG, "New env: %s\n", env_str.Value() );
	}

        // now put the env back into the JobAd:
	if(!env.InsertEnvIntoClassAd(JobAd,&env_errors)) {
		dprintf( D_ALWAYS, "Unable to update env! Aborting: %s\n",
				 env_errors.Value());
		return 0;
	}

	return 1;
}
Exemplo n.º 23
0
void Defrag::config()
{

	ASSERT( param(m_state_file,"DEFRAG_STATE_FILE") );
	if( m_last_poll==0 ) {
		loadState();
	}

	int old_polling_interval = m_polling_interval;
	m_polling_interval = param_integer("DEFRAG_INTERVAL",600);
	if( m_polling_interval <= 0 ) {
		dprintf(D_ALWAYS,
				"DEFRAG_INTERVAL=%d, so no pool defragmentation "
				"will be done.\n", m_polling_interval);
		if( m_polling_timer != -1 ) {
			daemonCore->Cancel_Timer(m_polling_timer);
			m_polling_timer = -1;
		}
	}
	else if( m_polling_timer >= 0 ) {
		if( old_polling_interval != m_polling_interval ) {
			daemonCore->Reset_Timer_Period(
				m_polling_timer,
				m_polling_interval);
		}
	}
	else {
		time_t now = time(NULL);
		int first_time = 0;
		if( m_last_poll != 0 && now-m_last_poll < m_polling_interval && m_last_poll <= now ) {
			first_time = m_polling_interval - (now-m_last_poll);
		}
		m_polling_timer = daemonCore->Register_Timer(
			first_time,
			m_polling_interval,
			(TimerHandlercpp)&Defrag::poll,
			"Defrag::poll",
			this );
	}
	if( old_polling_interval != m_polling_interval && m_polling_interval > 0 )
	{
		dprintf(D_ALWAYS,
				"Will evaluate defragmentation policy every DEFRAG_INTERVAL="
				"%d seconds.\n", m_polling_interval);
	}

	m_draining_per_hour = param_double("DEFRAG_DRAINING_MACHINES_PER_HOUR",0,0);

	double rate = m_draining_per_hour/3600.0*m_polling_interval;
	m_draining_per_poll = (int)floor(rate + 0.00001);
	if( m_draining_per_poll < 0 ) m_draining_per_poll = 0;

	double error_per_hour = (rate - m_draining_per_poll)/m_polling_interval*3600.0;
	m_draining_per_poll_hour = (int)floor(error_per_hour + 0.00001);
	if( m_draining_per_hour < 0 || m_polling_interval > 3600 ) {
		m_draining_per_hour = 0;
	}

	double error_per_day = (error_per_hour - m_draining_per_poll_hour)*24.0;
	m_draining_per_poll_day = (int)floor(error_per_day + 0.5);
	if( m_draining_per_poll_day < 0 || m_polling_interval > 3600*24 ) {
		m_draining_per_poll_day = 0;
	}
	dprintf(D_ALWAYS,"polling interval %ds, DEFRAG_DRAINING_MACHINES_PER_HOUR = %f/hour = %d/interval + %d/hour + %d/day\n",
			m_polling_interval,m_draining_per_hour,m_draining_per_poll,
			m_draining_per_poll_hour,m_draining_per_poll_day);

	m_max_draining = param_integer("DEFRAG_MAX_CONCURRENT_DRAINING",-1,-1);
	m_max_whole_machines = param_integer("DEFRAG_MAX_WHOLE_MACHINES",-1,-1);

	ASSERT( param(m_defrag_requirements,"DEFRAG_REQUIREMENTS") );
	validateExpr( m_defrag_requirements.c_str(), "DEFRAG_REQUIREMENTS" );

	ASSERT( param(m_whole_machine_expr,"DEFRAG_WHOLE_MACHINE_EXPR") );
	validateExpr( m_whole_machine_expr.c_str(), "DEFRAG_WHOLE_MACHINE_EXPR" );

	ASSERT( param(m_draining_schedule_str,"DEFRAG_DRAINING_SCHEDULE") );
	if( m_draining_schedule_str.empty() ) {
		m_draining_schedule = DRAIN_GRACEFUL;
		m_draining_schedule_str = "graceful";
	}
	else {
		m_draining_schedule = getDrainingScheduleNum(m_draining_schedule_str.c_str());
		if( m_draining_schedule < 0 ) {
			EXCEPT("Invalid draining schedule: %s\n",m_draining_schedule_str.c_str());
		}
	}

	MyString rank;
	param(rank,"DEFRAG_RANK");
	if( rank.IsEmpty() ) {
		m_rank_ad.Delete(ATTR_RANK);
	}
	else {
		if( !m_rank_ad.AssignExpr(ATTR_RANK,rank.Value()) ) {
			EXCEPT("Invalid expression for DEFRAG_RANK: %s\n",
				   rank.Value());
		}
	}

	int update_interval = param_integer("DEFRAG_UPDATE_INTERVAL", 600);
	if(m_public_ad_update_interval != update_interval) {
		m_public_ad_update_interval = update_interval;

		dprintf(D_FULLDEBUG, "Setting update interval to %d\n",
			m_public_ad_update_interval);

		if(m_public_ad_update_timer >= 0) {
			daemonCore->Reset_Timer_Period(
				m_public_ad_update_timer,
				m_public_ad_update_interval);
		}
		else {
			m_public_ad_update_timer = daemonCore->Register_Timer(
				0,
				m_public_ad_update_interval,
				(TimerHandlercpp)&Defrag::updateCollector,
				"Defrag::updateCollector",
				this);
		}
	}

	if (param(m_cancel_requirements, "DEFRAG_CANCEL_REQUIREMENTS")) {
		validateExpr( m_cancel_requirements.c_str(), "DEFRAG_CANCEL_REQUIREMENTS" );
	} else {
		m_cancel_requirements = "";
	}

	param(m_defrag_name,"DEFRAG_NAME");

	int stats_quantum = m_polling_interval;
	int stats_window = 10*stats_quantum;
	m_stats.SetWindowSize(stats_window,stats_quantum);
}
Exemplo n.º 24
0
TreqMode
transfer_mode(MyString mode)
{
	return transfer_mode(mode.Value());
}
Exemplo n.º 25
0
int get_fqdn_and_ip_from_hostname(const MyString& hostname,
		MyString& fqdn, condor_sockaddr& addr) {

	MyString ret;
	condor_sockaddr ret_addr;
	bool found_ip = false;

	// if the hostname contains dot, hostname is assumed to be full hostname
	if (hostname.FindChar('.') != -1) {
		ret = hostname;
	}

	if (nodns_enabled()) {
		// if nodns is enabled, convert hostname to ip address directly
		ret_addr = convert_hostname_to_ipaddr(hostname);
		found_ip = true;
	} else {
		// we look through getaddrinfo and gethostbyname
		// to further seek fully-qualified domain name and corresponding
		// ip address
		addrinfo_iterator ai;
		int res  = ipv6_getaddrinfo(hostname.Value(), NULL, ai);
		if (res) {
			dprintf(D_HOSTNAME, "ipv6_getaddrinfo() could not look up %s: %s (%d)\n", hostname.Value(),
				gai_strerror(res), res);
			return 0;
		}

		while (addrinfo* info = ai.next()) {
			if (info->ai_canonname) {
				fqdn = info->ai_canonname;
				addr = condor_sockaddr(info->ai_addr);
				return 1;
			}
		}

		hostent* h = gethostbyname(hostname.Value());
		if (h && h->h_name && strchr(h->h_name, '.')) {
			fqdn = h->h_name;
			addr = condor_sockaddr((sockaddr*)h->h_addr);
			return 1;
		}
		if (h && h->h_aliases && *h->h_aliases) {
			for (char** alias = h->h_aliases; *alias; ++alias) {
				if (strchr(*alias, '.')) {
					fqdn = *alias;
					addr = condor_sockaddr((sockaddr*)h->h_addr);
					return 1;
				}
			}
		}
	}

	MyString default_domain;

	// if FQDN is still unresolved, try DEFAULT_DOMAIN_NAME
	if (ret.Length() == 0 && param(default_domain, "DEFAULT_DOMAIN_NAME")) {
		ret = hostname;
		if (ret[ret.Length() - 1] != '.')
			ret += ".";
		ret += default_domain;
	}

	if (ret.Length() > 0 && found_ip) {
		fqdn = ret;
		addr = ret_addr;
		return 1;
	}
	return 0;
}
Exemplo n.º 26
0
void Pigeon::initialize() {
  /*        if (m_state == STATE_RUNNING) {
  */
  MyString* qpidPort;
  char *path = NULL;
  //notify us when our process is down.
  m_reaper = daemonCore->Register_Reaper(
      "reaperQpid",
      (ReaperHandlercpp) &Pigeon::reaperResponse,
      "Qpid process reaper", (Service*) this);		

  ASSERT(m_reaper != FALSE);

  //ClassAd Initialization
  //cleanup metafile
  ArgList argClean;
  clean();

  char* proc= param("QPID_EXEC");
  if (!proc) {
  	dprintf(D_ALWAYS, "You need to specify the QPID executable as QPID_EXEC in your condor config \n");
  	EXCEPT("No qpid executable (QPID_EXEC) specified!");
  }
  MyString hostname = get_local_fqdn();
  
  ArgList arglist; 
  arglist.AppendArg("qpidd");
  char *qpidConf = param("QPID_CONF");
  if (qpidConf) {
  	arglist.AppendArg("--config");
  	arglist.AppendArg(qpidConf);
  	free(qpidConf);
  } else {
  	
  	arglist.AppendArg("-p0");
  	arglist.AppendArg("--auth");
  	arglist.AppendArg("no");
  }
  
  
  MyString argString;
  arglist.GetArgsStringForDisplay(&argString);
  dprintf(D_ALWAYS, "\n Invoking: %s\n", argString.Value());
  path = getPortPath();
  int fd_stdout = safe_open_wrapper(path, O_RDWR|O_CREAT, 0666);
  free(path);
  int fds[3] = {-1, fd_stdout, -1};
  int mm_pid = daemonCore->Create_Process(proc,arglist,PRIV_CONDOR_FINAL, 0,FALSE,FALSE,NULL,NULL,NULL,NULL,fds);
  if (mm_pid <= 0) 
    EXCEPT("Failed to launch qpid process using Create_Process.");

  dprintf(D_ALWAYS, "Launched qpid process pid=%d \n", mm_pid);
  sleep(10);
  close(fd_stdout);
 
  char *portChr = getPort(false);
  string portStr = string(portChr);
  free(portChr);
  free(proc);
  if(strcmp(portStr.c_str(),"") != 0){
    m_qpidAd.Assign("PORT", portStr.c_str());
    dprintf(D_ALWAYS,"qpid process started on port number %s \n", portStr.c_str());
  }  
  SetMyTypeName(m_qpidAd, "pigeon");
  SetTargetTypeName(m_qpidAd, "");
  std::string hostAddr = "pigeon@";
  hostAddr += hostname.Value();
  m_qpidAd.Assign(ATTR_NAME, "pigeon"); //hostAddr.c_str());
  m_qpidAd.Assign("Key", "qpidKey");
  m_qpidAd.Assign("IP","128" );
  daemonCore->publish(&m_qpidAd); 

  //Register a timer for periodically pushing classads.
  //TODO: Make these rate and interval configurable
  dprintf(D_ALWAYS, "Calling the classAd publish()\n");
  daemonCore->Register_Timer(1, m_adPubInterval, (TimerHandlercpp) &Pigeon::publishClassAd, 
      "publishClassAd", this);

  dprintf(D_ALWAYS, "Launched qpid process pid=%d at port=|%s|\n", mm_pid,portStr.c_str());
  
  
  char *execDir = param("SBIN");
  if (execDir) {
  	dprintf(D_ALWAYS, "Declaring queues...  \n");
  	ArgList qArglist;
  	proc = (char*)malloc(strlen(execDir) + 15);
  	sprintf(proc, "%s%c%s",execDir, DIR_DELIM_CHAR, "declareQueues");
  	qArglist.AppendArg(proc);
  	qArglist.AppendArg(hostname);
  	qArglist.AppendArg(portStr.c_str());
  	mm_pid = daemonCore->Create_Process(proc,qArglist,PRIV_CONDOR_FINAL, 0,FALSE,FALSE,NULL,NULL,NULL,NULL);
  	if (mm_pid <= 0) 
		EXCEPT("Failed to launch declareQueues process using Create_Process.");
    free(proc);
    free(execDir);
	dprintf(D_ALWAYS, "QPID queues declared. \n");
   }
}
Exemplo n.º 27
0
void
fillRequirements( ClassAd* req )
{
	MyString jic_req = "TARGET.";
	if (jobad_path) {
		jic_req += ATTR_HAS_JIC_LOCAL_STDIN;
	} else {
		jic_req += ATTR_HAS_JIC_LOCAL_CONFIG;
	}
	jic_req += "==TRUE";

	MyString require;
	require = ATTR_REQUIREMENTS;
	require += "=(";

	if (requirements) {
		require += requirements;
		require += ")&&(";
	}
	int slot_id = 0;
	if (name) {
		if (sscanf(name, "slot%d@", &slot_id) != 1) { 
			slot_id = 0;
		}
	}
	if (slot_id > 0) {
		require += "TARGET.";
		require += ATTR_SLOT_ID;
		require += "==";
		require += slot_id;
		require += ")&&(";
	}
	else if (param_boolean("ALLOW_VM_CRUFT", false)) {
		int vm_id = 0;
		if (name) {
			if (sscanf(name, "vm%d@", &vm_id) != 1) { 
				vm_id = 0;
			}
		}
		if (vm_id > 0) {
			require += "TARGET.";
			require += ATTR_VIRTUAL_MACHINE_ID;
			require += "==";
			require += vm_id;
			require += ")&&(";
		}
	}

	require += jic_req;
	require += ')';

	if( ! req->Insert(require.Value()) ) {
		fprintf( stderr, "ERROR: can't parse requirements '%s'\n",
				 requirements );
		exit( 1 );
	}

#if defined(ADD_TARGET_SCOPING)
		// The user may have entered some references to machine attributes
		// without explicitly specifying the TARGET scope.
	req->AddTargetRefs( TargetMachineAttrs );
#endif

}
Exemplo n.º 28
0
bool
Email::writeExit( ClassAd* ad, int exit_reason )
{
		// if we're not currently open w/ a message, we're done
	if( ! fp ) {
		return false;
	}

		// gather all the info out of the job ad which we want to 
		// put into the email message.

	int had_core = FALSE;
	if( ! ad->LookupBool(ATTR_JOB_CORE_DUMPED, had_core) ) {
		if( exit_reason == JOB_COREDUMPED ) {
			had_core = TRUE;
		}
	}

		// TODO ATTR_JOB_CORE_FILE_NAME...
		// we need the IWD in both cases...

	int q_date = 0;
	ad->LookupInteger( ATTR_Q_DATE, q_date );
	
	double remote_sys_cpu = 0.0;
	ad->LookupFloat( ATTR_JOB_REMOTE_SYS_CPU, remote_sys_cpu );
	
	double remote_user_cpu = 0.0;
	ad->LookupFloat( ATTR_JOB_REMOTE_USER_CPU, remote_user_cpu );
	
	int image_size = 0;
	ad->LookupInteger( ATTR_IMAGE_SIZE, image_size );
	
	int shadow_bday = 0;
	ad->LookupInteger( ATTR_SHADOW_BIRTHDATE, shadow_bday );
	
	double previous_runs = 0;
	ad->LookupFloat( ATTR_JOB_REMOTE_WALL_CLOCK, previous_runs );
	
	time_t arch_time=0;	/* time_t is 8 bytes some archs and 4 bytes on other
						   archs, and this means that doing a (time_t*)
						   cast on & of a 4 byte int makes my life hell.
						   So we fix it by assigning the time we want to
						   a real time_t variable, then using ctime()
						   to convert it to a string */
	
	time_t now = time(NULL);

	writeJobId( ad );
	MyString msg;
	if( ! printExitString(ad, exit_reason, msg)	) {
		msg += "exited in an unknown way";
	}
	fprintf( fp, "%s\n", msg.Value() );

	if( had_core ) {
			// TODO!
			// fprintf( fp, "Core file is: %s\n", getCoreName() );
		fprintf( fp, "Core file generated\n" );
	}

	arch_time = q_date;
	fprintf(fp, "\n\nSubmitted at:        %s", ctime(&arch_time));
	
	if( exit_reason == JOB_EXITED || exit_reason == JOB_COREDUMPED ) {
		double real_time = now - q_date;
		arch_time = now;
		fprintf(fp, "Completed at:        %s", ctime(&arch_time));
		
		fprintf(fp, "Real Time:           %s\n", 
				d_format_time(real_time));
	}	

	fprintf( fp, "\n" );
	
	fprintf(fp, "Virtual Image Size:  %d Kilobytes\n\n", image_size);
	
	double rutime = remote_user_cpu;
	double rstime = remote_sys_cpu;
	double trtime = rutime + rstime;
	double wall_time = 0;
	fprintf(fp, "Statistics from last run:\n");
	if(shadow_bday != 0) {	// Handle cases where this wasn't set (grid)
		wall_time = now - shadow_bday;
	}
	fprintf(fp, "Allocation/Run time:     %s\n",d_format_time(wall_time) );
	fprintf(fp, "Remote User CPU Time:    %s\n", d_format_time(rutime) );
	fprintf(fp, "Remote System CPU Time:  %s\n", d_format_time(rstime) );
	fprintf(fp, "Total Remote CPU Time:   %s\n\n", d_format_time(trtime));
	
	double total_wall_time = previous_runs + wall_time;
	fprintf(fp, "Statistics totaled from all runs:\n");
	fprintf(fp, "Allocation/Run time:     %s\n",
			d_format_time(total_wall_time) );

		// TODO: deal w/ bytes directly from the classad
	return true;
}
Exemplo n.º 29
0
void
BaseShadow::baseInit( ClassAd *job_ad, const char* schedd_addr, const char *xfer_queue_contact_info )
{
	int pending = FALSE;

	if( ! job_ad ) {
		EXCEPT("baseInit() called with NULL job_ad!");
	}
	jobAd = job_ad;

	if (sendUpdatesToSchedd && ! is_valid_sinful(schedd_addr)) {
		EXCEPT("schedd_addr not specified with valid address");
	}
	scheddAddr = sendUpdatesToSchedd ? strdup( schedd_addr ) : strdup("noschedd");

	m_xfer_queue_contact_info = xfer_queue_contact_info;

	if ( !jobAd->LookupString(ATTR_OWNER, owner)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_OWNER);
	}

	if( !jobAd->LookupInteger(ATTR_CLUSTER_ID, cluster)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_CLUSTER_ID);
	}

	if( !jobAd->LookupInteger(ATTR_PROC_ID, proc)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_PROC_ID);
	}


		// Grab the GlobalJobId if we've got it.
	if( ! jobAd->LookupString(ATTR_GLOBAL_JOB_ID, &gjid) ) {
		gjid = NULL;
	}

	// grab the NT domain if we've got it
	jobAd->LookupString(ATTR_NT_DOMAIN, domain);
	if ( !jobAd->LookupString(ATTR_JOB_IWD, iwd)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_JOB_IWD);
	}

	if( !jobAd->LookupFloat(ATTR_BYTES_SENT, prev_run_bytes_sent) ) {
		prev_run_bytes_sent = 0;
	}
	if( !jobAd->LookupFloat(ATTR_BYTES_RECVD, prev_run_bytes_recvd) ) {
		prev_run_bytes_recvd = 0;
	}

		// construct the core file name we'd get if we had one.
	MyString tmp_name = iwd;
	tmp_name += DIR_DELIM_CHAR;
	tmp_name += "core.";
	tmp_name += cluster;
	tmp_name += '.';
	tmp_name += proc;
	core_file_name = strdup( tmp_name.Value() );

        // put the shadow's sinful string into the jobAd.  Helpful for
        // the mpi shadow, at least...and a good idea in general.
	MyString tmp_addr = ATTR_MY_ADDRESS;
	tmp_addr += "=\"";
	tmp_addr += daemonCore->InfoCommandSinfulString();
	tmp_addr += '"';
    if ( !jobAd->Insert( tmp_addr.Value() )) {
        EXCEPT( "Failed to insert %s!", ATTR_MY_ADDRESS );
    }

	DebugId = display_dprintf_header;
	
	config();

		// Make sure we've got enough swap space to run
	checkSwap();

	// handle system calls with Owner's privilege
// XXX this belong here?  We'll see...
	// Calling init_user_ids() while in user priv causes badness.
	// Make sure we're in another priv state.
	set_condor_priv();
	if ( !init_user_ids(owner.Value(), domain.Value())) {
		dprintf(D_ALWAYS, "init_user_ids() failed as user %s\n",owner.Value() );
		// uids.C will EXCEPT when we set_user_priv() now
		// so there's not much we can do at this point
		
#if ! defined(WIN32)
		if ( param_boolean( "SHADOW_RUN_UNKNOWN_USER_JOBS", false ) )
		{
			dprintf(D_ALWAYS, "trying init_user_ids() as user nobody\n" );
			
			owner="nobody";
			domain=NULL;
			if (!init_user_ids(owner.Value(), domain.Value()))
			{
				dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			}
			else
			{
				jobAd->Assign( ATTR_JOB_RUNAS_OWNER, "FALSE" );
				m_RunAsNobody=true;
				dprintf(D_ALWAYS, "init_user_ids() now running as user nobody\n");
			}
		}
#endif

	}
	set_user_priv();
	daemonCore->Register_Priv_State( PRIV_USER );

	dumpClassad( "BaseShadow::baseInit()", this->jobAd, D_JOB );

		// initialize the UserPolicy object
	shadow_user_policy.init( jobAd, this );

		// setup an object to keep our job ad updated to the schedd's
		// permanent job queue.  this clears all the dirty bits on our
		// copy of the classad, so anything we touch after this will
		// be updated to the schedd when appropriate.

		// Unless we got a command line arg asking us not to
	if (sendUpdatesToSchedd) {
		// the usual case
		job_updater = new QmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	} else {
		job_updater = new NullQmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	}

		// init user log; hold on failure
		// NOTE: job_updater must be initialized _before_ initUserLog(),
		// in order to handle the case of the job going on hold as a
		// result of failure in initUserLog().
	initUserLog();

		// change directory; hold on failure
	if ( cdToIwd() == -1 ) {
		EXCEPT("Could not cd to initial working directory");
	}

		// check to see if this invocation of the shadow is just to write
		// a terminate event and exit since this job had been recorded as
		// pending termination, but somehow the job didn't leave the queue
		// and the schedd is trying to restart it again..
	if( jobAd->LookupInteger(ATTR_TERMINATION_PENDING, pending)) {
		if (pending == TRUE) {
			// If the classad of this job "thinks" that this job should be
			// finished already, let's enact that belief.
			// This function does not return.
			this->terminateJob(US_TERMINATE_PENDING);
		}
	}

		// If we need to claim the startd before activating the claim
	int wantClaiming = 0;
	jobAd->LookupBool(ATTR_CLAIM_STARTD, wantClaiming);
	if (wantClaiming) {
		MyString startdSinful;
		MyString claimid;

			// Pull startd addr and claimid out of the jobad
		jobAd->LookupString(ATTR_STARTD_IP_ADDR, startdSinful);
		jobAd->LookupString(ATTR_CLAIM_ID, claimid);

		dprintf(D_ALWAYS, "%s is true, trying to claim startd %s\n", ATTR_CLAIM_STARTD, startdSinful.Value());

		classy_counted_ptr<DCStartd> startd = new DCStartd("description", NULL, startdSinful.Value(), claimid.Value());
	
		classy_counted_ptr<DCMsgCallback> cb = 
			new DCMsgCallback((DCMsgCallback::CppFunction)&BaseShadow::startdClaimedCB,
			this, jobAd);
																 
			// this can't fail, will always call the callback
		startd->asyncRequestOpportunisticClaim(jobAd, 
											   "description", 
											   daemonCore->InfoCommandSinfulString(), 
											   1200 /*alive interval*/, 
											   20 /* net timeout*/, 
											   100 /*total timeout*/, 
											   cb);
	}
}
Exemplo n.º 30
0
extern "C" void
log_termination (struct rusage *localr, struct rusage *remoter)
{
	check_execute_event();

	switch (WTERMSIG(JobStatus))
	{
	  case 0:
	  case -1: 
		// if core, bad exectuable --- otherwise, a normal exit
		if (WCOREDUMP(JobStatus) && WEXITSTATUS(JobStatus) == ENOEXEC)
		{
			// log the ULOG_EXECUTABLE_ERROR event
			ExecutableErrorEvent event;
			event.errType = CONDOR_EVENT_NOT_EXECUTABLE;
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS, "Unable to log NOT_EXECUTABLE event\n");
			}
		}
		else
		if (WCOREDUMP(JobStatus) && WEXITSTATUS(JobStatus) == 0)
		{		
			// log the ULOG_EXECUTABLE_ERROR event
			ExecutableErrorEvent event;
			event.errType = CONDOR_EVENT_BAD_LINK;
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS, "Unable to log BAD_LINK event\n");
			}
		}
		else
		{
			// log the ULOG_JOB_TERMINATED event
			JobTerminatedEvent event;
			event.normal = true; // normal termination
			event.returnValue = WEXITSTATUS(JobStatus);
			event.total_local_rusage = Proc->local_usage;
			event.total_remote_rusage = Proc->remote_usage[0];
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			event.total_recvd_bytes = TotalBytesSent + event.recvd_bytes;
			event.total_sent_bytes = TotalBytesRecvd + event.sent_bytes;
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS,"Unable to log ULOG_JOB_TERMINATED event\n");
			}
		}
		break;


	  case SIGKILL:
		// evicted without a checkpoint
		{
			JobEvictedEvent event;
			event.checkpointed = false;
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS, "Unable to log ULOG_JOB_EVICTED event\n");
			}
		}
		break;

			
	  case SIGQUIT:
		// evicted, but *with* a checkpoint
		{
			JobEvictedEvent event;
			event.checkpointed = true;
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS, "Unable to log ULOG_JOB_EVICTED event\n");
			}
		}
		break;


	  default:
		// abnormal termination
		{
			MyString coredir;
			MyString coreFile;
			JobTerminatedEvent event;
			event.normal = false;
			event.signalNumber = WTERMSIG(JobStatus);

			if (WCOREDUMP(JobStatus))
			{
				/* look up the corefile name in the job ad if one exists... */
				if (!JobAd->LookupString(ATTR_JOB_CORE_FILENAME, coreFile)) {
					/* if it didn't exist in the job ad, then construct what it
						should be. */

					ASSERT( condor_getcwd(coredir) );

					if (strcmp (Proc->rootdir, "/") == 0)
					{
						coreFile.formatstr( "%s/core.%d.%d", coredir.Value(),
							 	Proc->id.cluster, Proc->id.proc );
					}
					else
					{
						coreFile.formatstr( "%s%s/core.%d.%d", Proc->rootdir,
							 	coredir.Value(), Proc->id.cluster, Proc->id.proc );
					}
				} 
				
				event.setCoreFile( coreFile.Value() );
			}
			event.run_local_rusage = *localr;
			event.run_remote_rusage = *remoter;
			event.total_local_rusage = Proc->local_usage;
			event.total_remote_rusage = Proc->remote_usage[0];
			// we want to log the events from the perspective of the
			// user job, so if the shadow *sent* the bytes, then that
			// means the user job *received* the bytes
			event.recvd_bytes = BytesSent;
			event.sent_bytes = BytesRecvd;
			if (syscall_sock) {
				event.recvd_bytes += syscall_sock->get_bytes_sent();
				event.sent_bytes += syscall_sock->get_bytes_recvd();
			}
			if (!ULog.writeEvent (&event))
			{
				dprintf (D_ALWAYS,"Unable to log ULOG_JOB_TERMINATED event\n");
			}
		}
	}
}