示例#1
0
void BaseShadow::config()
{
	reconnect_ceiling = param_integer( "RECONNECT_BACKOFF_CEILING", 300 );

	reconnect_e_factor = 0.0;
	reconnect_e_factor = param_double( "RECONNECT_BACKOFF_FACTOR", 2.0, 0.0 );
	if( reconnect_e_factor < -1e-4 || reconnect_e_factor > 1e-4) {
    	reconnect_e_factor = 2.0;
    }

	m_cleanup_retry_tid = -1;
	m_num_cleanup_retries = 0;
		// NOTE: these config knobs are very similar to
		// LOCAL_UNIVERSE_MAX_JOB_CLEANUP_RETRIES and
		// LOCAL_UNIVERSE_JOB_CLEANUP_RETRY_DELAY in the local starter.
	m_max_cleanup_retries = param_integer("SHADOW_MAX_JOB_CLEANUP_RETRIES", 5);
	m_cleanup_retry_delay = param_integer("SHADOW_JOB_CLEANUP_RETRY_DELAY", 30);

	m_lazy_queue_update = param_boolean("SHADOW_LAZY_QUEUE_UPDATE", true);
}
示例#2
0
// functions to make the hashkeys ...
// make hashkeys from the obtained ad
bool
makeStartdAdHashKey (AdNameHashKey &hk, const ClassAd *ad )
{

	// get the name of the startd;
	// this gets complicated with ID
	if ( !adLookup( "Start", ad, ATTR_NAME, NULL, hk.name, false ) ) {
		logWarning( "Start", ATTR_NAME, ATTR_MACHINE, ATTR_SLOT_ID );

		// Get the machine name; if it's not there, give up
		if ( !adLookup( "Start", ad, ATTR_MACHINE, NULL, hk.name, false ) ) {
			logError( "Start", ATTR_NAME, ATTR_MACHINE );
			return false;
		}
		// Finally, if there is a slot ID, append it.
		int	slot;
		if (ad->LookupInteger( ATTR_SLOT_ID, slot)) {
			hk.name += ":";
			hk.name += IntToStr( slot );
		}
		else if (param_boolean("ALLOW_VM_CRUFT", false) &&
				 ad->LookupInteger(ATTR_VIRTUAL_MACHINE_ID, slot)) {
			hk.name += ":";
			hk.name += IntToStr( slot );
		}
	}

	hk.ip_addr = "";
	// As of 7.5.0, we look for MyAddress.  Prior to that, we did not,
	// so new startds must continue to send StartdIpAddr to remain
	// compatible with old collectors.
	if ( !getIpAddr( "Start", ad, ATTR_MY_ADDRESS, ATTR_STARTD_IP_ADDR,
					 hk.ip_addr ) ) {
		dprintf (D_FULLDEBUG,
				 "StartAd: No IP address in classAd from %s\n",
				 hk.name.Value() );
	}

	return true;
}
示例#3
0
int CollectorEngine::
scheduleHousekeeper (int timeout)
{
	// Are we filtering updates that we forward to the view collector?
	std::string watch_list;
	param(watch_list,"COLLECTOR_FORWARD_WATCH_LIST", "State,Cpus,Memory,IdleJobs,ClaimId,Capability,ClaimIdList,ChildClaimIds");
	m_forwardWatchList.clearAll();
	m_forwardWatchList.initializeFromString(watch_list.c_str());

	m_forwardFilteringEnabled = param_boolean( "COLLECTOR_FORWARD_FILTERING", false );

	// cancel outstanding housekeeping requests
	if (housekeeperTimerID != -1)
	{
		(void) daemonCore->Cancel_Timer(housekeeperTimerID);
	}

	// reset for new timer
	if (timeout < 0)
		return 0;

	// set to new timeout interval
	machineUpdateInterval = timeout;

	m_forwardInterval = param_integer("COLLECTOR_FORWARD_INTERVAL", machineUpdateInterval / 3, 0);

	// if timeout interval was non-zero (i.e., housekeeping required) ...
	if (timeout > 0)
	{
		// schedule housekeeper
		housekeeperTimerID = daemonCore->Register_Timer(machineUpdateInterval,
						machineUpdateInterval,
						(TimerHandlercpp)&CollectorEngine::housekeeper,
						"CollectorEngine::housekeeper",this);
		if (housekeeperTimerID == -1)
			return 0;
	}

	return 1;
}
示例#4
0
GlobusResource::GlobusResource( const char *resource_name,
								const Proxy *proxy, bool is_gt5 )
	: BaseResource( resource_name )
{
	initialized = false;
	proxySubject = strdup( proxy->subject->subject_name );
	proxyFQAN = strdup( proxy->subject->fqan );

	if ( param_boolean( "GRAM_VERSION_DETECTION", true ) ) {
		m_isGt5 = false;
		m_versionKnown = false;
	} else {
		m_isGt5 = is_gt5;
		m_versionKnown = true;
	}

	submitJMLimit = DEFAULT_MAX_JOBMANAGERS_PER_RESOURCE / 2;
	restartJMLimit = DEFAULT_MAX_JOBMANAGERS_PER_RESOURCE - submitJMLimit;

	checkMonitorTid = daemonCore->Register_Timer( TIMER_NEVER,
							(TimerHandlercpp)&GlobusResource::CheckMonitor,
							"GlobusResource::CheckMonitor", (Service*)this );
	monitorActive = false;
	monitorSubmitActive = false;
	monitorStarting = false;

	monitorDirectory = NULL;
	monitorJobStatusFile = NULL;
	monitorLogFile = NULL;
	logFileLastReadTime = 0;
	jobStatusFileLastUpdate = 0;
	monitorGramJobId = NULL;
	gahp = NULL;
	monitorGahp = NULL;
	monitorRetryTime = 0;
	monitorFirstStartup = true;
	monitorGramJobStatus = GLOBUS_GRAM_PROTOCOL_JOB_STATE_UNKNOWN;
	monitorGramErrorCode = 0;
}
示例#5
0
void
check_uid_for_privsep()
{
#if !defined(WIN32)
	if (param_boolean("PRIVSEP_ENABLED", false) && (getuid() == 0)) {
		uid_t condor_uid = get_condor_uid();
		if (condor_uid == 0) {
			EXCEPT("PRIVSEP_ENABLED set, but current UID is 0 "
			           "and condor UID is also set to root");
		}
		dprintf(D_ALWAYS,
		        "PRIVSEP_ENABLED set, but UID is 0; "
		            "will drop to UID %u and restart\n",
		        (unsigned)condor_uid);
		daemons.CleanupBeforeRestart();
		set_condor_priv_final();
		daemons.ExecMaster();
		EXCEPT("attempt to restart (via exec) failed (%s)",
		       strerror(errno));
	}
#endif
}
bool
GLExecPrivSepHelper::chown_sandbox_to_user(PrivSepError &err)
{
	ASSERT(m_initialized);

	if (m_sandbox_owned_by_user) {
		dprintf(D_FULLDEBUG,
		        "GLExecPrivSepHelper::chown_sandbox_to_user: "******"sandbox already user-owned\n");
		return true;
	}

	dprintf(D_FULLDEBUG, "changing sandbox ownership to the user\n");

	ArgList args;
	args.AppendArg(m_setup_script);
	args.AppendArg(m_glexec);
	args.AppendArg(m_proxy);
	args.AppendArg(m_sandbox);
	args.AppendArg(m_glexec_retries);
	args.AppendArg(m_glexec_retry_delay);
	MyString error_desc = "error changing sandbox ownership to the user: "******"GLEXEC_HOLD_ON_INITIAL_FAILURE",true) ) {
			// Do not put the job on hold due to glexec failure.
			// It will simply return to idle status and try again.
			hold_code = 0;
		}

		err.setHoldInfo( hold_code, rc, error_desc.Value());
		return false;
	}

	m_sandbox_owned_by_user = true;
	return true;
}
示例#7
0
void BaseShadow::config()
{
	if (spool) free(spool);
	spool = param("SPOOL");
	if (!spool) {
		EXCEPT("SPOOL not specified in config file.");
	}

	if (fsDomain) free(fsDomain);
	fsDomain = param( "FILESYSTEM_DOMAIN" );
	if (!fsDomain) {
		EXCEPT("FILESYSTEM_DOMAIN not specified in config file.");
	}

	if (uidDomain) free(uidDomain);
	uidDomain = param( "UID_DOMAIN" );
	if (!uidDomain) {
		EXCEPT("UID_DOMAIN not specified in config file.");
	}

	reconnect_ceiling = param_integer( "RECONNECT_BACKOFF_CEILING", 300 );

	reconnect_e_factor = 0.0;
	reconnect_e_factor = param_double( "RECONNECT_BACKOFF_FACTOR", 2.0, 0.0 );
	if( reconnect_e_factor < -1e-4 || reconnect_e_factor > 1e-4) {
    	reconnect_e_factor = 2.0;
    }

	m_cleanup_retry_tid = -1;
	m_num_cleanup_retries = 0;
		// NOTE: these config knobs are very similar to
		// LOCAL_UNIVERSE_MAX_JOB_CLEANUP_RETRIES and
		// LOCAL_UNIVERSE_JOB_CLEANUP_RETRY_DELAY in the local starter.
	m_max_cleanup_retries = param_integer("SHADOW_MAX_JOB_CLEANUP_RETRIES", 5);
	m_cleanup_retry_delay = param_integer("SHADOW_JOB_CLEANUP_RETRY_DELAY", 30);

	m_lazy_queue_update = param_boolean("SHADOW_LAZY_QUEUE_UPDATE", true);
}
示例#8
0
bool
JobInfoCommunicator::allowRunAsOwner( bool default_allow, bool default_request )
{
    ASSERT( job_ad );

    // First check if our policy allows RunAsOwner
    // Eval as an expression so a policy such as this can be specified:
    // TARGET.RunAsOwner =?= True

    bool run_as_owner = param_boolean( "STARTER_ALLOW_RUNAS_OWNER",
                                       default_allow, true, NULL, job_ad );

    // Next check if the job has requested runas_owner
    if( run_as_owner ) {
        bool user_wants_runas_owner = default_request;
        job_ad->LookupBool(ATTR_JOB_RUNAS_OWNER,user_wants_runas_owner);
        if ( !user_wants_runas_owner ) {
            run_as_owner = false;
        }
    }

    return run_as_owner;
}
ResourceRequestList::ResourceRequestList(int protocol_version)
	: m_send_end_negotiate(false),
	m_send_end_negotiate_now(false),
	m_requests_to_fetch(0)
{
	m_protocol_version = protocol_version;
	m_clear_rejected_autoclusters = false;
	m_use_resource_request_counts = param_boolean("USE_RESOURCE_REQUEST_COUNTS",true);
	if ( protocol_version == 0 || m_use_resource_request_counts == false ) {
		// Protocol version is 0, and schedd resource request lists were introduced
		// in protocol version 1.  so set m_num_to_fetch to 1 so we use the old
		// protocol of getting one request at a time with this old schedd.
		// Also we must use the old protocol if admin disabled USE_RESOURCE_REQUEST_COUNTS,
		// since the new protocol relies on that.
		m_num_to_fetch = 1;
	} else {
		m_num_to_fetch = param_integer("NEGOTIATOR_RESOURCE_REQUEST_LIST_SIZE");
	}
	errcode = 0;
	current_autocluster = -1;
	resource_request_count = 0;
	resource_request_offers = 0;
}
示例#10
0
void initTypes( int max_types, StringList **type_strings, int except )
{
	int i;
	char* tmp;
	MyString buf;

	_checkInvalidParam("SLOT_TYPE_0", except);
		// CRUFT
	_checkInvalidParam("VIRTUAL_MACHINE_TYPE_0", except);

	if (! type_strings[0]) {
			// Type 0 is the special type for evenly divided slots.
		type_strings[0] = new StringList();
		type_strings[0]->initializeFromString("auto");
	}

	for( i=1; i < max_types; i++ ) {
		if( type_strings[i] ) {
			continue;
		}
		buf.formatstr("SLOT_TYPE_%d", i);
		tmp = param(buf.Value());
		if (!tmp) {
			if (param_boolean("ALLOW_VM_CRUFT", false)) {
				buf.formatstr("VIRTUAL_MACHINE_TYPE_%d", i);
				if (!(tmp = param(buf.Value()))) {
					continue;
				}
			} else {
				continue;
			}
		}
		type_strings[i] = new StringList();
		type_strings[i]->initializeFromString( tmp );
		free( tmp );
	}
}
示例#11
0
VMType::VMType(const char* prog_for_script, const char* scriptname, const char* workingpath, ClassAd *ad)
{
	m_vm_id = vmgahp->getNewVMId();
	m_prog_for_script = prog_for_script;
	m_scriptname = scriptname;
	m_workingpath = workingpath;
	m_classAd = *ad;
	m_vm_pid = 0;
	m_vm_mem = 0;
	m_vm_networking = false;
	m_vm_checkpoint = false;
	m_vm_no_output_vm = false; 
	m_vm_hardware_vt = false; 
	m_is_soft_suspended = false;
	m_self_shutdown = false;
	m_is_checkpointed = false;
	m_status = VM_STOPPED;
	m_cpu_time = 0;
	m_delete_working_files = false;
	m_vcpus = 1;

	if ( privsep_enabled() ) {
		m_file_owner = PRIV_CONDOR;
	} else {
		m_file_owner = PRIV_USER;
	}

	vmprintf(D_FULLDEBUG, "Constructed VM_Type.\n");

	// Use the script program to create a configuration file for VM ?
	m_use_script_to_create_config = 
		param_boolean("USE_SCRIPT_TO_CREATE_CONFIG", false);

	// Create initially transfered file list
	createInitialFileList();

}
示例#12
0
int
LogSetAttribute::ReadBody(FILE* fp)
{
	int rval, rval1;

	free(key);
	rval1 = readword(fp, key);
	if (rval1 < 0) {
		return rval1;
	}

	free(name);
	rval = readword(fp, name);
	if (rval < 0) {
		return rval;
	}
	rval1 += rval;

	free(value);
	rval = readline(fp, value);
	if (rval < 0) {
		return rval;
	}

    if (value_expr) delete value_expr;
    value_expr = NULL;
    if (ParseClassAdRvalExpr(value, value_expr)) {
        if (value_expr) delete value_expr;
        value_expr = NULL;
        if (param_boolean("CLASSAD_LOG_STRICT_PARSING", true)) {
            return -1;
        } else {
            dprintf(D_ALWAYS, "WARNING: strict classad parsing failed for expression: \"%s\"\n", value);
        }
    }
	return rval + rval1;
}
示例#13
0
    CondorLockFile(boost::python::object file, LOCK_TYPE lock_type)
        : m_lock_type(lock_type)
    {
        int fd = -1;
        std::string name;
        if (py_hasattr(file, "name"))
        {
            name = boost::python::extract<std::string>(file.attr("name"));
        }
        if (py_hasattr(file, "fileno"))
        {
            fd = boost::python::extract<int>(file.attr("fileno")());
        }
        else
        {
            THROW_EX(TypeError, "LockFile must be used with a file object.");
        }

        // Which locking protocol to use (old/new) is left up to the caller; this replicates the
        // logic from src/condor_utils/read_user_log.cpp
        bool new_locking = param_boolean("CREATE_LOCKS_ON_LOCAL_DISK", true);
#if defined(WIN32)
        new_locking = false;
#endif
        if (new_locking && name.length())
        {
            m_file_lock = boost::shared_ptr<FileLock>(new FileLock(name.c_str(), true, false));
            if (!m_file_lock->initSucceeded() ) {
                m_file_lock = boost::shared_ptr<FileLock>(new FileLock(fd, NULL, name.c_str()));
            }
        }
        else
        {
            m_file_lock = boost::shared_ptr<FileLock>(new FileLock(fd, NULL, name.length() ? name.c_str() : NULL));
        }
    }
示例#14
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
	if (cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	char * mount_under_scratch = param("MOUNT_UNDER_SCRATCH");
	if (mount_under_scratch) {

		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);
			free(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			return FALSE;
		}
	}

	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	if (cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "MEMORY_LIMIT", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);
			} else {
				dprintf(D_ALWAYS, "Not setting memory soft limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory soft limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of MEMORY_LIMIT: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int slotWeight;
		if (MachineAd->LookupInteger(ATTR_SLOT_WEIGHT, slotWeight)) {
			climits.set_cpu_shares(slotWeight*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of SlotWeight in machine ClassAd; ignoring.\n");
		}
	}

#endif

	return retval;
}
示例#15
0
void
BaseShadow::terminateJob( update_style_t kind ) // has a default argument of US_NORMAL
{
	int reason;
	bool signaled;
	MyString str;

	if( ! jobAd ) {
		dprintf( D_ALWAYS, "In terminateJob() w/ NULL JobAd!" );
	}

	/* The first thing we do is record that we are in a termination pending
		state. */
	if (kind == US_NORMAL) {
		str.formatstr("%s = TRUE", ATTR_TERMINATION_PENDING);
		jobAd->Insert(str.Value());
	}

	if (kind == US_TERMINATE_PENDING) {
		// In this case, the job had already completed once and the
		// status had been saved to the job queue, however, for
		// some reason, the shadow didn't exit with a good value and
		// the job had been requeued. When this style of update
		// is used, it is a shortcut from the very birth of the shadow
		// to here, and so there will not be a remote resource or
		// anything like that set up. In this situation, we just
		// want to write the log event and mail the user and exit
		// with a good exit code so the schedd removes the job from
		// the queue. If for some reason the logging fails once again,
		// the process continues to repeat. 
		// This means at least once semantics for the termination event
		// and user email, but at no time should the job actually execute
		// again.

		int exited_by_signal = FALSE;
		int exit_signal = 0;
		int exit_code = 0;

		getJobAdExitedBySignal(jobAd, exited_by_signal);
		if (exited_by_signal == TRUE) {
			getJobAdExitSignal(jobAd, exit_signal);
		} else {
			getJobAdExitCode(jobAd, exit_code);
		}

		if (exited_by_signal == TRUE) {
			reason = JOB_COREDUMPED;
			str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, core_file_name);
			jobAd->Insert(str.Value());
		} else {
			reason = JOB_EXITED;
		}

		dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 		getCluster(), getProc(), 
	 		exited_by_signal? "killed by signal" : "exited with status",
	 		exited_by_signal ? exit_signal : exit_code );
		
			// write stuff to user log, but get values from jobad
		logTerminateEvent( reason, kind );

			// email the user, but get values from jobad
		emailTerminateEvent( reason, kind );

		DC_Exit( reason );
	}

	// the default path when kind == US_NORMAL

	// cleanup this shadow (kill starters, etc)
	cleanUp();

	reason = getExitReason();
	signaled = exitedBySignal();

	/* also store the corefilename into the jobad so we can recover this 
		during a termination pending scenario. */
	if( reason == JOB_COREDUMPED ) {
		str.formatstr("%s = \"%s\"", ATTR_JOB_CORE_FILENAME, getCoreName());
		jobAd->Insert(str.Value());
	}

    // Update final Job committed time
    int last_ckpt_time = 0;
    jobAd->LookupInteger(ATTR_LAST_CKPT_TIME, last_ckpt_time);
    int current_start_time = 0;
    jobAd->LookupInteger(ATTR_JOB_CURRENT_START_DATE, current_start_time);
    int int_value = (last_ckpt_time > current_start_time) ?
                        last_ckpt_time : current_start_time;

    if( int_value > 0 ) {
        int job_committed_time = 0;
        jobAd->LookupInteger(ATTR_JOB_COMMITTED_TIME, job_committed_time);
		int delta = (int)time(NULL) - int_value;
        job_committed_time += delta;
        jobAd->Assign(ATTR_JOB_COMMITTED_TIME, job_committed_time);

		float slot_weight = 1;
		jobAd->LookupFloat(ATTR_JOB_MACHINE_ATTR_SLOT_WEIGHT0, slot_weight);
		float slot_time = 0;
		jobAd->LookupFloat(ATTR_COMMITTED_SLOT_TIME, slot_time);
		slot_time += slot_weight * delta;
		jobAd->Assign(ATTR_COMMITTED_SLOT_TIME, slot_time);
    }

	CommitSuspensionTime(jobAd);

	// update the job ad in the queue with some important final
	// attributes so we know what happened to the job when using
	// condor_history...
    if (m_num_cleanup_retries < 1 &&
        param_boolean("SHADOW_TEST_JOB_CLEANUP_RETRY", false)) {
		dprintf( D_ALWAYS,
				 "Testing Failure to perform final update to job queue!\n");
		retryJobCleanup();
		return;
    }
	if( !updateJobInQueue(U_TERMINATE) ) {
		dprintf( D_ALWAYS, 
				 "Failed to perform final update to job queue!\n");
		retryJobCleanup();
		return;
	}

	// Let's maximize the effectiveness of that queue synchronization and
	// only record the job as done if the update to the queue was successful.
	// If any of these next operations fail and the shadow exits with an
	// exit code which causes the job to get requeued, it will be in the
	// "terminate pending" state marked by the ATTR_TERMINATION_PENDING
	// attribute.

	dprintf( D_ALWAYS, "Job %d.%d terminated: %s %d\n",
	 	getCluster(), getProc(), 
	 	signaled ? "killed by signal" : "exited with status",
	 	signaled ? exitSignal() : exitCode() );

	// write stuff to user log:
	logTerminateEvent( reason );

	// email the user
	emailTerminateEvent( reason );

	if( reason == JOB_EXITED && claimIsClosing() ) {
			// Startd not accepting any more jobs on this claim.
			// We do this here to avoid having to treat this case
			// identically to JOB_EXITED in the code leading up to
			// this point.
		dprintf(D_FULLDEBUG,"Startd is closing claim, so no more jobs can be run on it.\n");
		reason = JOB_EXITED_AND_CLAIM_CLOSING;
	}

	// try to get a new job for this shadow
	if( recycleShadow(reason) ) {
		// recycleShadow delete's this, so we must return immediately
		return;
	}

	// does not return.
	DC_Exit( reason );
}
示例#16
0
int
DCStartd::delegateX509Proxy( const char* proxy, time_t expiration_time, time_t *result_expiration_time )
{
	dprintf( D_FULLDEBUG, "Entering DCStartd::delegateX509Proxy()\n" );

	setCmdStr( "delegateX509Proxy" );

	if( ! claim_id ) {
		newError( CA_INVALID_REQUEST,
				  "DCStartd::delegateX509Proxy: Called with NULL claim_id" );
		return CONDOR_ERROR;
	}

		// if this claim is associated with a security session
	ClaimIdParser cidp(claim_id);

	//
	// 1) begin the DELEGATE_GSI_CRED_STARTD command
	//
	ReliSock* tmp = (ReliSock*)startCommand( DELEGATE_GSI_CRED_STARTD,
	                                         Stream::reli_sock,
	                                         20, NULL, NULL, false,
											 cidp.secSessionId() ); 
	if( ! tmp ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: Failed to send command DELEGATE_GSI_CRED_STARTD to the startd" );
		return CONDOR_ERROR;
	}

	//
	// 2) get reply from startd - OK means continue, NOT_OK means
	//    don't bother (the startd doesn't require a delegated
	//    proxy
	//
	tmp->decode();
	int reply;
	if( !tmp->code(reply) ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: failed to receive reply from startd (1)" );
		delete tmp;
		return CONDOR_ERROR;
	}
	if ( !tmp->end_of_message() ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: end of message error from startd (1)" );
		delete tmp;
		return CONDOR_ERROR;
	}
	if( reply == NOT_OK ) {
		delete tmp;
		return NOT_OK;
	}
		
	//
	// 3) send over the claim id and delegate (or copy) the given proxy
	//
	tmp->encode();
	int use_delegation =
		param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ? 1 : 0;
	if( !tmp->code( claim_id ) ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: Failed to send claim id to the startd" );
		delete tmp;
		return CONDOR_ERROR;
	}
	if ( !tmp->code( use_delegation ) ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: Failed to send use_delegation flag to the startd" );
		delete tmp;
		return CONDOR_ERROR;
	}
	int rv;
	filesize_t dont_care;
	if( use_delegation ) {
		rv = tmp->put_x509_delegation( &dont_care, proxy, expiration_time, result_expiration_time );
	}
	else {
		dprintf( D_FULLDEBUG,
		         "DELEGATE_JOB_GSI_CREDENTIALS is False; using direct copy\n");
		if( ! tmp->get_encryption() ) {
			newError( CA_COMMUNICATION_ERROR,
					  "DCStartd::delegateX509Proxy: Cannot copy: channel does not have encryption enabled" );
			delete tmp;
			return CONDOR_ERROR;
		}
		rv = tmp->put_file( &dont_care, proxy );
	}
	if( rv == -1 ) {
		newError( CA_FAILURE,
				  "DCStartd::delegateX509Proxy: Failed to delegate proxy" );
		delete tmp;
		return CONDOR_ERROR;
	}
	if ( !tmp->end_of_message() ) {
		newError( CA_FAILURE,
				  "DCStartd::delegateX509Proxy: end of message error to startd" );
		delete tmp;
		return CONDOR_ERROR;
	}

	// command successfully sent; now get the reply
	tmp->decode();
	if( !tmp->code(reply) ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: failed to receive reply from startd (2)" );
		delete tmp;
		return CONDOR_ERROR;
	}
	if ( !tmp->end_of_message() ) {
		newError( CA_COMMUNICATION_ERROR,
				  "DCStartd::delegateX509Proxy: end of message error from startd (2)" );
		delete tmp;
		return CONDOR_ERROR;
	}
	delete tmp;

	dprintf( D_FULLDEBUG,
	         "DCStartd::delegateX509Proxy: successfully sent command, reply is: %d\n",
	         reply );

	return reply;
}
示例#17
0
void
fillRequirements( ClassAd* req )
{
	MyString jic_req = "TARGET.";
	if (jobad_path) {
		jic_req += ATTR_HAS_JIC_LOCAL_STDIN;
	} else {
		jic_req += ATTR_HAS_JIC_LOCAL_CONFIG;
	}
	jic_req += "==TRUE";

	MyString require;
	require = ATTR_REQUIREMENTS;
	require += "=(";

	if (requirements) {
		require += requirements;
		require += ")&&(";
	}
	int slot_id = 0;
	if (name) {
		if (sscanf(name, "slot%d@", &slot_id) != 1) { 
			slot_id = 0;
		}
	}
	if (slot_id > 0) {
		require += "TARGET.";
		require += ATTR_SLOT_ID;
		require += "==";
		require += slot_id;
		require += ")&&(";
	}
	else if (param_boolean("ALLOW_VM_CRUFT", false)) {
		int vm_id = 0;
		if (name) {
			if (sscanf(name, "vm%d@", &vm_id) != 1) { 
				vm_id = 0;
			}
		}
		if (vm_id > 0) {
			require += "TARGET.";
			require += ATTR_VIRTUAL_MACHINE_ID;
			require += "==";
			require += vm_id;
			require += ")&&(";
		}
	}

	require += jic_req;
	require += ')';

	if( ! req->Insert(require.Value()) ) {
		fprintf( stderr, "ERROR: can't parse requirements '%s'\n",
				 requirements );
		exit( 1 );
	}

#if defined(ADD_TARGET_SCOPING)
		// The user may have entered some references to machine attributes
		// without explicitly specifying the TARGET scope.
	req->AddTargetRefs( TargetMachineAttrs );
#endif

}
示例#18
0
void
init_daemon_list()
{
	char	*daemon_name;
	StringList daemon_names, dc_daemon_names;

	daemons.ordered_daemon_names.clearAll();
	char* dc_daemon_list = param("DC_DAEMON_LIST");

	if( !dc_daemon_list ) {
		dc_daemon_names.initializeFromString(default_dc_daemon_list);
	}
	else {
		if ( *dc_daemon_list == '+' ) {
			MyString	dclist;
			dclist = default_dc_daemon_list;
			dclist += ", ";
			dclist += &dc_daemon_list[1];
			dc_daemon_names.initializeFromString( dclist.Value() );
		}
		else {
			dc_daemon_names.initializeFromString(dc_daemon_list);

			StringList default_list(default_dc_daemon_list);
			default_list.rewind();
			char *default_entry;
			int	missing = 0;
			while( (default_entry=default_list.next()) ) {
				if( !dc_daemon_names.contains_anycase(default_entry) ) {
					dprintf(D_ALWAYS,
							"WARNING: expected to find %s in"
							" DC_DAEMON_LIST, but it is not there.\n",
							default_entry );
					missing++;
				}
			}
			if ( missing ) {
				dprintf( D_ALWAYS,
						 "WARNING: "
						 "%d entries are missing from DC_DAEMON_LIST.  "
						 "Unless you know what you are doing, it "
						 "is best to leave DC_DAEMON_LIST undefined "
						 "so that the default settings are used, "
						 "or use the new 'DC_DAEMON_LIST = "
						 "+<list>' syntax.\n", missing );
			}
				
		}
		free(dc_daemon_list);
	}

		// Tolerate a trailing comma in the list
	dc_daemon_names.remove( "" );

	char* ha_list = param("MASTER_HA_LIST");
	if( ha_list ) {
			// Make MASTER_HA_LIST case insensitive by always converting
			// what we get to uppercase.
		StringList ha_names;
		ha_list = strupr( ha_list );
		ha_names.initializeFromString(ha_list);
			// Tolerate a trailing comma in the list
		ha_names.remove( "" );
		daemons.ordered_daemon_names.create_union( ha_names, false );

		ha_names.rewind();
		while( (daemon_name = ha_names.next()) ) {
			if(daemons.FindDaemon(daemon_name) == NULL) {
				if( dc_daemon_names.contains(daemon_name) ) {
					new class daemon(daemon_name, true, true );
				} else {
					new class daemon(daemon_name, false, true );
				}
			}
		}
	}

	char* daemon_list = param("DAEMON_LIST");
	if( daemon_list ) {
			// Make DAEMON_LIST case insensitive by always converting
			// what we get to uppercase.
		daemon_list = strupr( daemon_list );
		daemon_names.initializeFromString(daemon_list);
		free( daemon_list );
			// Tolerate a trailing comma in the list
		daemon_names.remove( "" );


			/*
			  Make sure that if COLLECTOR is in the list, put it at
			  the front...  unfortunately, our List template (what
			  StringList is defined in terms of) is amazingly broken
			  with regard to insert() and append().  :( insert()
			  usually means: insert *before* the current position.
			  however, if you're at the end, it inserts before the
			  last valid entry, instead of working like append() as
			  you'd expect.  OR, if you just called rewind() and
			  insert() (to insert at the begining, right?) it works
			  like append() and sticks it at the end!!  ARGH!!!  so,
			  we've got to call next() after rewind() so we really
			  insert it at the front of the list.  UGH!  EVIL!!!
			  Derek Wright <*****@*****.**> 2004-12-23
			*/
		if( daemon_names.contains("COLLECTOR") ) {
			daemon_names.deleteCurrent();
			daemon_names.rewind();
			daemon_names.next();
			daemon_names.insert( "COLLECTOR" );
		}

			// start shared_port first for a cleaner startup
		if( daemon_names.contains("SHARED_PORT") ) {
			daemon_names.deleteCurrent();
			daemon_names.rewind();
			daemon_names.next();
			daemon_names.insert( "SHARED_PORT" );
		}
		else if( SharedPortEndpoint::UseSharedPort() ) {
			if( param_boolean("AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST",true) ) {
				dprintf(D_ALWAYS,"Adding SHARED_PORT to DAEMON_LIST, because USE_SHARED_PORT=true (to disable this, set AUTO_INCLUDE_SHARED_PORT_IN_DAEMON_LIST=False)\n");
				daemon_names.rewind();
				daemon_names.next();
				daemon_names.insert( "SHARED_PORT" );
			}
		}
		daemons.ordered_daemon_names.create_union( daemon_names, false );

		daemon_names.rewind();
		while( (daemon_name = daemon_names.next()) ) {
			if(daemons.FindDaemon(daemon_name) == NULL) {
				if( dc_daemon_names.contains(daemon_name) ) {
					new class daemon(daemon_name);
				} else {
					new class daemon(daemon_name, false);
				}
			}
		}
	} else {
		daemons.ordered_daemon_names.create_union( dc_daemon_names, false );
		for(int i = 0; default_daemon_list[i]; i++) {
			new class daemon(default_daemon_list[i]);
		}
	}

}
示例#19
0
static bool nodns_enabled()
{
	return param_boolean("NO_DNS", false);
}
示例#20
0
void
main_init( int argc, char* argv[] )
{
    extern int runfor;
	char	**ptr;

	if ( argc > 3 ) {
		usage( argv[0] );
	}
	
	int argc_count = 1;
	for( ptr=argv+1, argc_count = 1; argc_count<argc && *ptr; ptr++,argc_count++) {
		if( ptr[0][0] != '-' ) {
			usage( argv[0] );
		}
		switch( ptr[0][1] ) {
		case 'n':
			ptr++;
			if( !(ptr && *ptr) ) {
				EXCEPT( "-n requires another argument" );
			}
			MasterName = build_valid_daemon_name( *ptr );
			dprintf( D_ALWAYS, "Using name: %s\n", MasterName );
			break;
		default:
			usage( argv[0] );
		}
	}

    if (runfor != 0) {
        // We will construct an environment variable that 
        // tells the daemon what time it will be shut down. 
        // We'll give it an absolute time, though runfor is a 
        // relative time. This means that we don't have to update
        // the time each time we restart the daemon.
		MyString runfor_env;
		runfor_env.formatstr("%s=%ld", EnvGetName(ENV_DAEMON_DEATHTIME),
						   time(NULL) + (runfor * 60));
		SetEnv(runfor_env.Value());
    }

	daemons.SetDefaultReaper();
	
		// Grab all parameters needed by the master.
	init_params();
		// param() for DAEMON_LIST and initialize our daemons object.
	init_daemon_list();
	if ( daemons.SetupControllers() < 0 ) {
		EXCEPT( "Daemon initialization failed" );
	}
		// Lookup the paths to all the daemons we now care about.
	daemons.InitParams();
		// Initialize our classad;
	init_classad();  
		// Initialize the master entry in the daemons data structure.
	daemons.InitMaster();
		// Make sure if PrivSep is on we're not running as root
	check_uid_for_privsep();
		// open up the windows firewall 
	init_firewall_exceptions();

#if defined(WANT_CONTRIB) && defined(WITH_MANAGEMENT)
#if defined(HAVE_DLOPEN)
	MasterPluginManager::Load();
#elif defined(WIN32)
	load_master_mgmt();
#endif
	MasterPluginManager::Initialize();
#endif

		// Register admin commands
	daemonCore->Register_Command( RESTART, "RESTART",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( RESTART_PEACEFUL, "RESTART_PEACEFUL",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMONS_OFF, "DAEMONS_OFF",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMONS_OFF_FAST, "DAEMONS_OFF_FAST",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMONS_OFF_PEACEFUL, "DAEMONS_OFF_PEACEFUL",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMONS_ON, "DAEMONS_ON",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( MASTER_OFF, "MASTER_OFF",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( MASTER_OFF_FAST, "MASTER_OFF_FAST",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMON_ON, "DAEMON_ON",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMON_OFF, "DAEMON_OFF",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMON_OFF_FAST, "DAEMON_OFF_FAST",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( DAEMON_OFF_PEACEFUL, "DAEMON_OFF_PEACEFUL",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( CHILD_ON, "CHILD_ON",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( CHILD_OFF, "CHILD_OFF",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( CHILD_OFF_FAST, "CHILD_OFF_FAST",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	daemonCore->Register_Command( SET_SHUTDOWN_PROGRAM, "SET_SHUTDOWN_PROGRAM",
								  (CommandHandler)admin_command_handler, 
								  "admin_command_handler", 0, ADMINISTRATOR );
	// Command handler for stashing the pool password
	daemonCore->Register_Command( STORE_POOL_CRED, "STORE_POOL_CRED",
								(CommandHandler)&store_pool_cred_handler,
								"store_pool_cred_handler", NULL, CONFIG_PERM,
								D_FULLDEBUG );
	/*
	daemonCore->Register_Command( START_AGENT, "START_AGENT",
					  (CommandHandler)admin_command_handler, 
					  "admin_command_handler", 0, ADMINISTRATOR );
	*/

	daemonCore->RegisterTimeSkipCallback(time_skip_handler,0);

	_EXCEPT_Cleanup = DoCleanup;

#if !defined(WIN32)
	if( !dprintf_to_term_check() && param_boolean( "USE_PROCESS_GROUPS", true ) ) {
			// If we're not connected to a terminal, start our own
			// process group, unless the config file says not to.
		setsid();
	}
#endif

	if( StartDaemons ) {
		daemons.StartAllDaemons();
	}
	daemons.StartTimers();
}
示例#21
0
UserProc::UserProc( STARTUP_INFO &s ) :
	cluster( s.cluster ),
	proc( s.proc ),
	m_a_out( NULL ),
	core_name( NULL ),
	uid( s.uid ),
	gid( s.gid ),
	v_pid( s.virt_pid ),
	pid( 0 ),
	job_class( s.job_class ),
	state( NEW ),
	user_time( 0 ),
	sys_time( 0 ),
	exit_status_valid( FALSE ),
	exit_status( 0 ),
	ckpt_wanted( s.ckpt_wanted ),
	soft_kill_sig( s.soft_kill_sig ),
	new_ckpt_created( FALSE ),
	ckpt_transferred( FALSE ),
	core_created( FALSE ),
	core_transferred( FALSE ),
	exit_requested( FALSE ),
	image_size( -1 ),
	guaranteed_user_time( 0 ),
	guaranteed_sys_time( 0 ),
	pids_suspended( -1 )
{
	MyString	buf;
	mode_t 	omask;

	cmd = new char [ strlen(s.cmd) + 1 ];
	strcpy( cmd, s.cmd );

	// Since we are adding to the argument list, we may need to deal
	// with platform-specific arg syntax in the user's args in order
	// to successfully merge them with the additional args.
	args.SetArgV1SyntaxToCurrentPlatform();

	MyString args_errors;
	if(!args.AppendArgsV1or2Raw(s.args_v1or2,&args_errors)) {
		EXCEPT("ERROR: Failed to parse arguments string: %s\n%s",
			   args_errors.Value(),s.args_v1or2);
	}

		// set up environment as an object
	MyString env_errors;
	if(!env_obj.MergeFromV1or2Raw( s.env_v1or2,&env_errors )) {
		EXCEPT("ERROR: Failed to parse environment string: %s\n%s",
			   env_errors.Value(),s.env_v1or2);
	}

		// add name of SMP slot (from startd) into environment
	setSlotEnv(&env_obj);

	/* Port regulation for user job */
	int low_port, high_port;

		// assume outgoing port range
	if (get_port_range(TRUE, &low_port, &high_port) == TRUE) {
		buf.formatstr( "_condor_LOWPORT=%d", low_port);
		env_obj.SetEnv(buf.Value());
		buf.formatstr( "_condor_HIGHPORT=%d", high_port);
		env_obj.SetEnv(buf.Value());
	}
	/* end - Port regulation for user job */

	if( param_boolean("BIND_ALL_INTERFACES", true) ) {
		buf.formatstr( "_condor_BIND_ALL_INTERFACES=TRUE" );
	} else {
		buf.formatstr( "_condor_BIND_ALL_INTERFACES=FALSE" );
	}
	env_obj.SetEnv(buf.Value());
	

		// Generate a directory where process can run and do its checkpointing
	omask = umask(0);
	buf.formatstr( "dir_%d", getpid() );
	local_dir = new char [ buf.Length() + 1 ];
	strcpy( local_dir, buf.Value() );
	if (privsep_enabled()) {
		// the Switchboard expects a full path to privsep_create_dir
		MyString local_dir_path;
		local_dir_path.formatstr("%s/%s", Execute, local_dir);
		if (!privsep_create_dir(get_condor_uid(), local_dir_path.Value())) {
			EXCEPT("privsep_create_dir failure");
		}
		if (chmod(local_dir_path.Value(), LOCAL_DIR_MODE) == -1) {
			EXCEPT("chmod failure after privsep_create_dir");
		}
	}
	else {
		if( mkdir(local_dir,LOCAL_DIR_MODE) < 0 ) {
			EXCEPT( "mkdir(%s,0%o)", local_dir, LOCAL_DIR_MODE );
		}
	}
	(void)umask(omask);

		// Now that we know what the local_dir is, put the path into
		// the environment so the job knows where it is
	MyString scratch_env;
	scratch_env.formatstr("CONDOR_SCRATCH_DIR=%s/%s",Execute,local_dir);
	env_obj.SetEnv(scratch_env.Value());

	buf.formatstr( "%s/condor_exec.%d.%d", local_dir, cluster, proc );
	cur_ckpt = new char [ buf.Length() + 1 ];
	strcpy( cur_ckpt, buf.Value() );

		// Find out if user wants checkpointing
#if defined(NO_CKPT)
	ckpt_wanted = FALSE;
	dprintf(D_ALWAYS,
			"This platform doesn't implement checkpointing yet\n"
	);
#else
	ckpt_wanted = s.ckpt_wanted;
#endif

	restart = s.is_restart;
	coredump_limit_exists = s.coredump_limit_exists;
	coredump_limit = s.coredump_limit;
}
示例#22
0
void init_firewall_exceptions() {
#ifdef WIN32

	bool add_exception;
	char *master_image_path, *schedd_image_path, *startd_image_path,
		 *dbmsd_image_path, *quill_image_path, *dagman_image_path, 
		 *negotiator_image_path, *collector_image_path, *starter_image_path,
		 *shadow_image_path, *gridmanager_image_path, *gahp_image_path,
		 *gahp_worker_image_path, *credd_image_path, 
		 *vmgahp_image_path, *kbdd_image_path, *hdfs_image_path, *bin_path;
	const char* dagman_exe = "condor_dagman.exe";

	WindowsFirewallHelper wfh;
	
	add_exception = param_boolean("ADD_WINDOWS_FIREWALL_EXCEPTION", true);

	if ( add_exception == false ) {
		dprintf(D_FULLDEBUG, "ADD_WINDOWS_FIREWALL_EXCEPTION is false, skipping\n");
		return;
	}

	// We use getExecPath() here instead of param() since it's
	// possible the the Windows Service Control Manager
	// (SCM) points to one location for the master (which
	// is exec'd), while MASTER points to something else
	// (and ignored).
	
	master_image_path = getExecPath();
	if ( !master_image_path ) {	
		dprintf(D_ALWAYS, 
				"WARNING: Failed to get condor_master image path.\n"
				"Condor will not be excepted from the Windows firewall.\n");
		return;
	}

	// We want to add exceptions for the SCHEDD and the STARTD
	// so that (1) shadows can accept incoming connections on their 
	// command port and (2) so starters can do the same.

	schedd_image_path = param("SCHEDD");
	startd_image_path = param("STARTD");

	// We to also add exceptions for Quill and DBMSD

	quill_image_path = param("QUILL");
	dbmsd_image_path = param("DBMSD");

	// And add exceptions for all the other daemons, since they very well
	// may need to open a listen port for mechanisms like CCB, or HTTPS
	negotiator_image_path = param("NEGOTIATOR");
	collector_image_path = param("COLLECTOR");
	starter_image_path = param("STARTER");
	shadow_image_path = param("SHADOW");
	gridmanager_image_path = param("GRIDMANAGER");
	gahp_image_path = param("CONDOR_GAHP");
	gahp_worker_image_path = param("CONDOR_GAHP_WORKER");
	credd_image_path = param("CREDD");
	kbdd_image_path = param("KBDD");
	hdfs_image_path = param("HDFS");
	vmgahp_image_path = param("VM_GAHP_SERVER");
	
	// We also want to add exceptions for the DAGMan we ship
	// with Condor:

	bin_path = param ( "BIN" );
	if ( bin_path ) {
		dagman_image_path = (char*) malloc (
			strlen ( bin_path ) + strlen ( dagman_exe ) + 2 );
		if ( dagman_image_path ) {
			sprintf ( dagman_image_path, "%s\\%s", bin_path, dagman_exe );
		}
		free ( bin_path );
	}

	// Insert the master
	if ( !SUCCEEDED(wfh.addTrusted(master_image_path)) ) {
		dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				master_image_path);
	}

	// Insert daemons needed on a central manager
	if ( (daemons.FindDaemon("NEGOTIATOR") != NULL) && negotiator_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(negotiator_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				negotiator_image_path);
		}
	}
	if ( (daemons.FindDaemon("COLLECTOR") != NULL) && collector_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(collector_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				collector_image_path);
		}
	}

	// Insert daemons needed on a submit node
	if ( (daemons.FindDaemon("SCHEDD") != NULL) && schedd_image_path ) {
		// put in schedd
		if ( !SUCCEEDED(wfh.addTrusted(schedd_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				schedd_image_path);
		}
		// put in shadow
		if ( shadow_image_path && !SUCCEEDED(wfh.addTrusted(shadow_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				shadow_image_path);
		}
		// put in gridmanager
		if ( gridmanager_image_path && !SUCCEEDED(wfh.addTrusted(gridmanager_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				gridmanager_image_path);
		}
		// put in condor gahp
		if ( gahp_image_path && !SUCCEEDED(wfh.addTrusted(gahp_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				gahp_image_path);
		}
		// put in condor worker gahp
		if ( gahp_worker_image_path && !SUCCEEDED(wfh.addTrusted(gahp_worker_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				gahp_worker_image_path);
		}
	}

	// Insert daemons needed on a execute node.
	// Note we include the starter and friends seperately, since the
	// starter could run on either execute or submit nodes (think 
	// local universe jobs).
	if ( (daemons.FindDaemon("STARTD") != NULL) && startd_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(startd_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				startd_image_path);
		}
		if ( !SUCCEEDED(wfh.addTrusted(kbdd_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				kbdd_image_path);
		}
	}

	if ( (daemons.FindDaemon("QUILL") != NULL) && quill_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(quill_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				quill_image_path);
		}
	}

	if ( (daemons.FindDaemon("DBMSD") != NULL) && dbmsd_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(dbmsd_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				dbmsd_image_path);
		}
	}

	if ( starter_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(starter_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				starter_image_path);
		}
	}

	if ( (daemons.FindDaemon("CREDD") != NULL) && credd_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(credd_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				credd_image_path);
		}
	}

	if ( (daemons.FindDaemon("HDFS") != NULL) && hdfs_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(hdfs_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n", hdfs_image_path);
		}
	}

	if ( vmgahp_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted(vmgahp_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to the "
				"windows firewall exception list.\n",
				vmgahp_image_path);
		}
	}

	if ( dagman_image_path ) {
		if ( !SUCCEEDED(wfh.addTrusted (dagman_image_path)) ) {
			dprintf(D_FULLDEBUG, "WinFirewall: unable to add %s to "
				"the windows firewall exception list.\n",
				dagman_image_path);
		}
	}

	if ( master_image_path ) { free(master_image_path); }
	if ( schedd_image_path ) { free(schedd_image_path); }
	if ( startd_image_path ) { free(startd_image_path); }
	if ( quill_image_path )  { free(quill_image_path); }
	if ( dbmsd_image_path )  { free(dbmsd_image_path); }
	if ( dagman_image_path ) { free(dagman_image_path); }
	if ( negotiator_image_path ) { free(negotiator_image_path); }
	if ( collector_image_path ) { free(collector_image_path); }
	if ( shadow_image_path ) { free(shadow_image_path); }
	if ( gridmanager_image_path ) { free(gridmanager_image_path); }
	if ( gahp_image_path ) { free(gahp_image_path); }	
	if ( credd_image_path ) { free(credd_image_path); }	
	if ( vmgahp_image_path ) { free(vmgahp_image_path); }
	if ( kbdd_image_path ) { free(kbdd_image_path); }
#endif
}
示例#23
0
static void
check_execute_dir_perms( char const *exec_path )
{
	struct stat st;
	if (stat(exec_path, &st) < 0) {
		EXCEPT( "stat exec path (%s), errno: %d (%s)", exec_path, errno,
				strerror( errno ) ); 
	}

	// in PrivSep mode, the EXECUTE directory must be trusted by
	// the PrivSep kernel. we can't determine this ourselves in general
	// (since the PrivSep Switchboard can be recompiled to trust
	// non-root users), so we'll have to be satisfied for now that we
	// could stat its path
	//
	if (privsep_enabled()) {
		return;
	}

	// the following logic sets up the new_mode variable, depending
	// on the execute dir's current perms. if new_mode is set non-zero,
	// it means we need to do a chmod
	//
	mode_t new_mode = 0;
#if defined(WIN32)
	mode_t desired_mode = _S_IREAD | _S_IWRITE;
	if ((st.st_mode & desired_mode) != desired_mode) {
		new_mode = st.st_mode | desired_mode;
	}
#else
	// we want to avoid having our execute directory world-writable
	// if possible. it's possible if the execute directory is owned
	// by condor and either:
	//   - we're not switching UIDs. in this case, job sandbox dirs
	//     will just be owned by the condor UID, so we just need
	//     owner-writability
	//   - there's no root squash on the execute dir (since then we
	//     can do a mkdir as the condor UID then a chown to the job
	//     owner UID)
	//
	// additionally, the GLEXEC_JOB feature requires world-writability
	// on the execute dir
	//
	bool glexec_job = param_boolean("GLEXEC_JOB", false);
	if ((st.st_uid == get_condor_uid()) &&
	    (!can_switch_ids() || not_root_squashed(exec_path)) &&
	    !glexec_job)
	{
		// do the chown unless the current mode is exactly 755
		//
		if ((st.st_mode & 07777) != 0755) {
			new_mode = 0755;
		}
	}
	else {
		// do the chown if the mode doesn't already include 1777
		//
		if ((st.st_mode & 01777) != 01777) {
			new_mode = 01777;
		}
		if (!glexec_job) {
			dprintf(D_ALWAYS,
			        "WARNING: %s root-squashed or not condor-owned: "
			            "requiring world-writability\n",
			        exec_path);
		}
	}
#endif
	// now do a chmod if needed
	//
	if (new_mode != 0) {
		dprintf(D_FULLDEBUG, "Changing permission on %s\n", exec_path);
		if (chmod(exec_path, new_mode) < 0) {
			EXCEPT( "chmod exec path (%s), errno: %d (%s)", exec_path,
					errno, strerror( errno ) );
		}
	}
}
示例#24
0
/* Mess up the in memory job ad with interesting statistics about suspensions */
void record_suspension_hack(unsigned int action)
{
	char tmp[256];
	int total_suspensions;
	int last_suspension_time;
	int cumulative_suspension_time;
	extern char *schedd;

	if (!JobAd)
	{
		EXCEPT("Suspension code: Non-existant JobAd");
	}

	switch(action)
	{
		case ULOG_JOB_SUSPENDED:
			/* Add to ad number of suspensions */
			JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions);
			total_suspensions++;
			sprintf(tmp, "%s = %d", ATTR_TOTAL_SUSPENSIONS, total_suspensions);
			JobAd->Insert(tmp);

			/* Add to ad the current suspension time */
			last_suspension_time = time(NULL);
			sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, 
				last_suspension_time);
			JobAd->Insert(tmp);
			break;
		case ULOG_JOB_UNSUSPENDED: {
			/* add in the time I spent suspended to a running total */
			JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME,
				cumulative_suspension_time);
			JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME,
				last_suspension_time);
			int delta = time(NULL) - last_suspension_time;
			cumulative_suspension_time += delta;
			sprintf(tmp, "%s = %d", ATTR_CUMULATIVE_SUSPENSION_TIME,
				cumulative_suspension_time);
			JobAd->Insert(tmp);

			int uncommitted_suspension_time = 0;
			JobAd->LookupInteger(ATTR_UNCOMMITTED_SUSPENSION_TIME,
								 uncommitted_suspension_time);
			uncommitted_suspension_time += delta;
			JobAd->Assign(ATTR_UNCOMMITTED_SUSPENSION_TIME,uncommitted_suspension_time);

			/* set the current suspension time to zero, meaning not suspended */
			last_suspension_time = 0;
			sprintf(tmp, "%s = %d", ATTR_LAST_SUSPENSION_TIME, 
				last_suspension_time);
			JobAd->Insert(tmp);
			break;
		}
		default:
			EXCEPT("record_suspension_hack(): Action event not recognized.");
			break;
	}

	/* Sanity output */
	JobAd->LookupInteger(ATTR_TOTAL_SUSPENSIONS, total_suspensions);
	dprintf(D_FULLDEBUG,"%s = %d\n", ATTR_TOTAL_SUSPENSIONS, total_suspensions);
	JobAd->LookupInteger(ATTR_LAST_SUSPENSION_TIME, last_suspension_time);
	dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_LAST_SUSPENSION_TIME,
		last_suspension_time);
	JobAd->LookupInteger(ATTR_CUMULATIVE_SUSPENSION_TIME, 
		cumulative_suspension_time);
	dprintf(D_FULLDEBUG, "%s = %d\n", ATTR_CUMULATIVE_SUSPENSION_TIME,
		cumulative_suspension_time);
	
	/* If we've been asked to perform real time updates of the suspension
		information, then connect to the queue and do it here. */
	if (param_boolean("REAL_TIME_JOB_SUSPEND_UPDATES", false))
	{
			dprintf( D_ALWAYS, "Updating suspension info to schedd.\n" );
			if (!ConnectQ(schedd, SHADOW_QMGMT_TIMEOUT)) {
				/* Since these attributes aren't updated periodically, if
					the schedd is busy and a resume event update is lost,
					the the job will be marked suspended when it really isn't.
					The new shadow eventually corrects this via a periodic
					update of various calssad attributes, but I 
					suspect it won't be corrected in the event of a
					bad connect here for this shadow. */
				dprintf( D_ALWAYS, 
					"Timeout connecting to schedd. Suspension update lost.\n");
				return;
			}

        	SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
	            ATTR_TOTAL_SUSPENSIONS, total_suspensions);
        	SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
	            ATTR_CUMULATIVE_SUSPENSION_TIME, cumulative_suspension_time);
        	SetAttributeInt(Proc->id.cluster, Proc->id.proc, 
	            ATTR_LAST_SUSPENSION_TIME, last_suspension_time);

			DisconnectQ(NULL);
	}
}
示例#25
0
void
GangliaD::initAndReconfig()
{
	std::string libname;
	std::string gmetric_path;
	param(libname,"GANGLIA_LIB");
	param(gmetric_path,"GANGLIA_GMETRIC");
	if( libname.empty() && gmetric_path.empty()) {
		std::string libpath;
		char const *libpath_param = "GANGLIA_LIB_PATH";
		if( __WORDSIZE == 64 ) {
			libpath_param = "GANGLIA_LIB64_PATH";
		}
		param(libpath,libpath_param);

		dprintf(D_FULLDEBUG,"Searching for libganglia in %s=%s\n",libpath_param,libpath.c_str());
		locateSharedLib(libpath,"libganglia",libname);

		gmetric_path = "gmetric";

		if( libname.empty() && gmetric_path.empty()) {
			EXCEPT("libganglia was not found via %s, and GANGLIA_LIB is not configured, "
				   "and GANGLIA_GMETRIC is not configured.  "
				   "Ensure that libganglia is installed in a location included in %s "
				   "or configure GANGLIA_LIB and/or GANGLIA_GMETRIC.",
				   libpath_param, libpath_param);
		}
	}
	m_ganglia_noop = false;
	if( libname == "NOOP" ) {
		dprintf(D_ALWAYS,"GANGLIA_LIB=NOOP, so we will go through the motions, but not actually interact with ganglia\n");
		m_ganglia_noop = true;
	}

	if( !m_ganglia_noop ) {
		bool gmetric_initialized = false;
		bool libganglia_initialized = false;
		if( !gmetric_path.empty() ) {
			dprintf(D_ALWAYS,"Testing %s\n",gmetric_path.c_str());
			if( ganglia_init_gmetric(gmetric_path.c_str()) ) {
				gmetric_initialized = true;
			}
		}
		if( !libname.empty() ) {
			if( libname == m_ganglia_libname ) {
				libganglia_initialized = true;
				dprintf(D_ALWAYS,"Already loaded libganglia %s\n",libname.c_str());
				// I have observed instabilities when reloading the library, so
				// it is best to not do that unless it is really necessary.
			}
			else {
				dprintf(D_ALWAYS,"Loading libganglia %s\n",libname.c_str());
				ganglia_config_destroy(&m_ganglia_context,&m_ganglia_config,&m_ganglia_channels);
				if( ganglia_load_library(libname.c_str()) ) {
					libganglia_initialized = true;
					m_ganglia_libname = libname;
				}
				else if( gmetric_initialized ) {
					dprintf(D_ALWAYS,"WARNING: failed to load %s, so gmetric (which is slower) will be used instead.\n",libname.c_str());
				}
			}
		}

		if( libganglia_initialized ) {
			dprintf(D_ALWAYS,"Will use libganglia to interact with ganglia.\n");
		}
		else if( gmetric_initialized ) {
			dprintf(D_ALWAYS,"Will use gmetric to interact with ganglia.\n");
		}
		else {
			EXCEPT("Neither gmetric nor libganglia were successfully initialized.  Aborting");
		}

		std::string ganglia_conf_location;
		param(ganglia_conf_location, "GANGLIA_CONFIG", "/etc/ganglia/gmond.conf");
		int fd;
		if ((fd = safe_open_wrapper_follow(ganglia_conf_location.c_str(), O_RDONLY)) < 0)
		{
			EXCEPT("Cannot open Ganglia configuration file GANGLIA_CONFIG=%s.", ganglia_conf_location.c_str());
			return;
		}
		close(fd);

		if( !ganglia_reconfig(ganglia_conf_location.c_str(),&m_ganglia_context,&m_ganglia_config,&m_ganglia_channels) ) {
			EXCEPT("Failed to configure ganglia library.");
		}
	}

	param(m_gstat_command, "GANGLIA_GSTAT_COMMAND");
    split_args(m_gstat_command.c_str(), &m_gstat_argv);

    m_send_data_for_all_hosts = param_boolean("GANGLIA_SEND_DATA_FOR_ALL_HOSTS", false);

	StatsD::initAndReconfig("GANGLIAD");

	// the interval we tell ganglia is the max time between updates
	m_tmax = m_stats_pub_interval*2;
	// the interval we tell ganglia is the lifetime of the metric
	if( m_tmax*3 < 86400 ) {
		m_dmax = 86400;
	}
	else {
		m_dmax = m_tmax*3;
	}
}
示例#26
0
/*ARGSUSED*/
int
main(int argc, char *argv[] )
{
	char	*tmp = NULL;
	int		reserved_swap, free_swap;
	char	*host = NULL, *cluster = NULL, *proc = NULL;
	char	*bogus_capability;
	int		i;

	set_mySubSystem( "SHADOW", SUBSYSTEM_TYPE_SHADOW );

	myDistro->Init( argc, argv );
	if( argc == 2 && strncasecmp(argv[1], "-cl", 3) == MATCH ) {
		printClassAd();
		exit( 0 );
	}

#if defined(SYSCALL_DEBUG)
	SyscallLabel = argv[0] + 7;
#endif

#if !defined(WIN32)
	install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN );
#endif

	if( argc > 1 ) {
		if( strcmp("-t",argv[1]) == MATCH ) {
			Termlog = 1;
			argv++;
			argc--;
		}
	}

	ShadowBDate = LastRestartTime = time(0);

	_EXCEPT_Cleanup = ExceptCleanup;

	MyPid = getpid();
	
	config();

	/* Start up with condor.condor privileges. */
	/*
	  we need to do this AFTER we call config() so that if CONDOR_IDS
	  is being defined in the config file, we'll get the right value
	*/ 
	set_condor_priv();

	if(Termlog)
		dprintf_set_tool_debug(get_mySubSystem()->getName(), 0);
	else
		dprintf_config( get_mySubSystem()->getName() );
	DebugId = whoami;

	// create a database connection object
	
	dprintf( D_ALWAYS, "******* Standard Shadow starting up *******\n" );
	dprintf( D_ALWAYS, "** %s\n", CondorVersion() );
	dprintf( D_ALWAYS, "** %s\n", CondorPlatform() );
	dprintf( D_ALWAYS, "*******************************************\n" );

	reserved_swap = param_integer("RESERVED_SWAP", 0);
	reserved_swap *= 1024; /* megabytes -> kb */

	bool use_sql_log = param_boolean("QUILL_USE_SQL_LOG", false);
    FILEObj = FILESQL::createInstance(use_sql_log);
	
	free_swap = sysapi_swap_space();

	dprintf( D_FULLDEBUG, "*** Reserved Swap = %d\n", reserved_swap );
	dprintf( D_FULLDEBUG, "*** Free Swap = %d\n", free_swap );
	if( reserved_swap && free_swap < reserved_swap ) {
		dprintf( D_ALWAYS, "Not enough reserved swap space\n" );
		if(FILEObj) {
		  delete FILEObj;
		}

		exit( JOB_NO_MEM );
	}

	dprintf(D_ALWAYS, "uid=%d, euid=%d, gid=%d, egid=%d\n",
		getuid(), geteuid(), getgid(), getegid());

    dprintf(D_FULLDEBUG, "argc = %d\n", argc);
    for(i = 0; i < argc; i++)
    {
        dprintf(D_FULLDEBUG, "argv[%d] = %s\n", i, argv[i]);
    }
	if( argc < 6 ) {
		usage();
	}

	if (param_boolean("SHADOW_DEBUG_WAIT", false, false)) {
		int debug_wait = 1;
		dprintf(D_ALWAYS,
				"SHADOW_DEBUG_WAIT is TRUE, waiting for debugger to attach to pid %d.\n", 
				(int)::getpid());
		while (debug_wait) {
			sleep(1);
		}
	}

	CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,SPOOL_CUR_VERSION_SHADOW_SUPPORTS);

	if( strcmp("-pipe",argv[1]) == 0 ) {
		bogus_capability = argv[2];
		cluster = argv[3];
		proc = argv[4];
		// read the big comment in the function for why this is here.
		RemoveNewShadowDroppings(cluster, proc);
		pipe_setup( cluster, proc, bogus_capability );
	} else {
		schedd = argv[1];
		host = argv[2];
		bogus_capability = argv[3];
		cluster = argv[4];
		proc = argv[5];
		if ( argc > 6 ) {
			IpcFile = argv[6];
			dprintf(D_FULLDEBUG,"Setting IpcFile to %s\n",IpcFile);
		} else {
			IpcFile = NULL;
		}
		// read the big comment in the function for why this is here.
		RemoveNewShadowDroppings(cluster, proc);
		regular_setup( host, cluster, proc );
	}
	scheddName = getenv( EnvGetName( ENV_SCHEDD_NAME ) );

#if 0
		/* Don't want to share log file lock between child and pnarent */
	(void)close( LockFd );
	LockFd = -1;
#endif

	// initialize the user log
	initializeUserLog();

	My_Filesystem_Domain = param( "FILESYSTEM_DOMAIN" ); 
	dprintf( D_ALWAYS, "My_Filesystem_Domain = \"%s\"\n", 
			 My_Filesystem_Domain );

	My_UID_Domain = param( "UID_DOMAIN" ); 
	dprintf( D_ALWAYS, "My_UID_Domain = \"%s\"\n", My_UID_Domain );

	UseAFS = param_boolean_crufty( "USE_AFS", false ) ? TRUE : FALSE;

	UseNFS = param_boolean_crufty( "USE_NFS", false ) ? TRUE : FALSE;

	// if job specifies a checkpoint server host, this overrides
	// the config file parameters
	tmp = NULL;
	if (JobAd->LookupString(ATTR_CKPT_SERVER, &tmp) == 1) {
		if (CkptServerHost) free(CkptServerHost);
		UseCkptServer = TRUE;
		CkptServerHost = strdup(tmp);
		StarterChoosesCkptServer = FALSE;
		free(tmp);
	} else {
		free(tmp);
		if (CkptServerHost) {
            free(CkptServerHost);
        }
		CkptServerHost = param( "CKPT_SERVER_HOST" );
		UseCkptServer = FALSE;
		if( CkptServerHost && param_boolean_crufty( "USE_CKPT_SERVER", true ) ) {
			UseCkptServer = TRUE;
		}

		StarterChoosesCkptServer =
			param_boolean_crufty("STARTER_CHOOSES_CKPT_SERVER", true) ? TRUE : FALSE;
	}

		// Initialize location of our checkpoint file.  If we stored it
		// on a checkpoint server then set LastCkptServer.  Otherwise,
		// LastCkptServer should be NULL to indicate that we should
		// look on the local disk for our checkpoint file.
	LastCkptServer = NULL;
	if (JobAd->LookupString(ATTR_LAST_CKPT_SERVER,
							&LastCkptServer) == 0) {
		free(LastCkptServer);
		LastCkptServer = NULL;
	}

	// LIGO
	if (param_boolean("ALWAYS_USE_LOCAL_CKPT_SERVER", false)) {
		if (LastCkptServer) {
			char *remoteHost = NULL;
			JobAd->LookupString(ATTR_REMOTE_HOST, &remoteHost);

			char *machineName = strrchr(remoteHost, '@');
			if (machineName == NULL) {
				machineName = remoteHost;
			} else {
				machineName++;
			}

			LastCkptServer = strdup(machineName);
			CkptServerHost = strdup(machineName);

			dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, forcing LastCkptServer to %s\n", LastCkptServer);
		} else {
			dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, but checkpoint is not on server, restoring file local file\n");
		}
	}

	MaxDiscardedRunTime = param_integer( "MAX_DISCARDED_RUN_TIME", 3600 );

	CompressPeriodicCkpt = param_boolean( "COMPRESS_PERIODIC_CKPT", false );

	PeriodicSync = param_boolean( "PERIODIC_MEMORY_SYNC", false );

	CompressVacateCkpt = param_boolean( "COMPRESS_VACATE_CKPT", false );

	SlowCkptSpeed = param_integer( "SLOW_CKPT_SPEED", 0 );

	// Install signal handlers such that all signals are blocked when inside
	// the handler.
	sigset_t fullset;
	sigfillset(&fullset);
	install_sig_handler_with_mask( SIGCHLD,&fullset, reaper );

		// SIGUSR1 is sent by the schedd when a job is removed with
		// condor_rm.
	install_sig_handler_with_mask( SIGUSR1, &fullset, handle_sigusr1 );

		// SIGQUIT is sent for a fast shutdow.
	install_sig_handler_with_mask( SIGQUIT, &fullset, handle_sigquit );

		// SIGTERM is sent for a graceful shutdow.
	install_sig_handler_with_mask( SIGTERM, &fullset, handle_sigterm );


	/* Here we block the async signals.  We do this mainly because on HPUX,
	 * XDR wigs out if it is interrupted by a signal.  We do it on all
	 * platforms because it seems like a safe idea.  We unblock the signals
	 * during before select(), which is where we spend most of our time. */
	block_signal(SIGCHLD);
	block_signal(SIGUSR1);      

	/* If the completed job had been committed to the job queue,
		but for some reason the shadow exited wierdly and the
		schedd is trying to run it again, then simply write
		the job termination events and send the email as if the job had
		just ended. */
	if (terminate_is_pending() == TRUE) {
		/* This function will exit()! */
		handle_terminate_pending();
	}

	HandleSyscalls();

	Wrapup();

	/* HACK! WHOOO!!!!! Throw the old shadow away already! */
	/* This will figure out whether or not the job should go on hold AFTER the
		job has exited for whatever reason, or if the job should be allowed
		to exit. It modifies ExitReason approriately for job holding, or, get
		this, just EXCEPTs if the jobs is supposed to go into idle state and
		not leave. :) */
	/* Small change by Todd : only check the static policy if the job really
	   exited of its own accord -- we don't want to even evaluate the static
	   policy if the job exited because it was preempted, for instance */
	if (check_static_policy && 
		(ExitReason == JOB_EXITED || ExitReason == JOB_KILLED 
		     	|| ExitReason == JOB_COREDUMPED)) 
	{
		static_policy();
	}
    if( My_UID_Domain ) {
        free( My_UID_Domain );
    }
    if( My_Filesystem_Domain ) {
        free( My_Filesystem_Domain );
    }
        if(FILEObj) {
                delete FILEObj;
        }

	dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n",
		ExitReason );
	exit( ExitReason );
}
示例#27
0
void
BaseShadow::baseInit( ClassAd *job_ad, const char* schedd_addr, const char *xfer_queue_contact_info )
{
	int pending = FALSE;

	if( ! job_ad ) {
		EXCEPT("baseInit() called with NULL job_ad!");
	}
	jobAd = job_ad;

	if (sendUpdatesToSchedd && ! is_valid_sinful(schedd_addr)) {
		EXCEPT("schedd_addr not specified with valid address");
	}
	scheddAddr = sendUpdatesToSchedd ? strdup( schedd_addr ) : strdup("noschedd");

	m_xfer_queue_contact_info = xfer_queue_contact_info;

	if ( !jobAd->LookupString(ATTR_OWNER, owner)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_OWNER);
	}

	if( !jobAd->LookupInteger(ATTR_CLUSTER_ID, cluster)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_CLUSTER_ID);
	}

	if( !jobAd->LookupInteger(ATTR_PROC_ID, proc)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_PROC_ID);
	}


		// Grab the GlobalJobId if we've got it.
	if( ! jobAd->LookupString(ATTR_GLOBAL_JOB_ID, &gjid) ) {
		gjid = NULL;
	}

	// grab the NT domain if we've got it
	jobAd->LookupString(ATTR_NT_DOMAIN, domain);
	if ( !jobAd->LookupString(ATTR_JOB_IWD, iwd)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_JOB_IWD);
	}

	if( !jobAd->LookupFloat(ATTR_BYTES_SENT, prev_run_bytes_sent) ) {
		prev_run_bytes_sent = 0;
	}
	if( !jobAd->LookupFloat(ATTR_BYTES_RECVD, prev_run_bytes_recvd) ) {
		prev_run_bytes_recvd = 0;
	}

		// construct the core file name we'd get if we had one.
	MyString tmp_name = iwd;
	tmp_name += DIR_DELIM_CHAR;
	tmp_name += "core.";
	tmp_name += cluster;
	tmp_name += '.';
	tmp_name += proc;
	core_file_name = strdup( tmp_name.Value() );

        // put the shadow's sinful string into the jobAd.  Helpful for
        // the mpi shadow, at least...and a good idea in general.
	MyString tmp_addr = ATTR_MY_ADDRESS;
	tmp_addr += "=\"";
	tmp_addr += daemonCore->InfoCommandSinfulString();
	tmp_addr += '"';
    if ( !jobAd->Insert( tmp_addr.Value() )) {
        EXCEPT( "Failed to insert %s!", ATTR_MY_ADDRESS );
    }

	DebugId = display_dprintf_header;
	
	config();

		// Make sure we've got enough swap space to run
	checkSwap();

	// handle system calls with Owner's privilege
// XXX this belong here?  We'll see...
	// Calling init_user_ids() while in user priv causes badness.
	// Make sure we're in another priv state.
	set_condor_priv();
	if ( !init_user_ids(owner.Value(), domain.Value())) {
		dprintf(D_ALWAYS, "init_user_ids() failed as user %s\n",owner.Value() );
		// uids.C will EXCEPT when we set_user_priv() now
		// so there's not much we can do at this point
		
#if ! defined(WIN32)
		if ( param_boolean( "SHADOW_RUN_UNKNOWN_USER_JOBS", false ) )
		{
			dprintf(D_ALWAYS, "trying init_user_ids() as user nobody\n" );
			
			owner="nobody";
			domain=NULL;
			if (!init_user_ids(owner.Value(), domain.Value()))
			{
				dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			}
			else
			{
				jobAd->Assign( ATTR_JOB_RUNAS_OWNER, "FALSE" );
				m_RunAsNobody=true;
				dprintf(D_ALWAYS, "init_user_ids() now running as user nobody\n");
			}
		}
#endif

	}
	set_user_priv();
	daemonCore->Register_Priv_State( PRIV_USER );

	dumpClassad( "BaseShadow::baseInit()", this->jobAd, D_JOB );

		// initialize the UserPolicy object
	shadow_user_policy.init( jobAd, this );

		// setup an object to keep our job ad updated to the schedd's
		// permanent job queue.  this clears all the dirty bits on our
		// copy of the classad, so anything we touch after this will
		// be updated to the schedd when appropriate.

		// Unless we got a command line arg asking us not to
	if (sendUpdatesToSchedd) {
		// the usual case
		job_updater = new QmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	} else {
		job_updater = new NullQmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	}

		// init user log; hold on failure
		// NOTE: job_updater must be initialized _before_ initUserLog(),
		// in order to handle the case of the job going on hold as a
		// result of failure in initUserLog().
	initUserLog();

		// change directory; hold on failure
	if ( cdToIwd() == -1 ) {
		EXCEPT("Could not cd to initial working directory");
	}

		// check to see if this invocation of the shadow is just to write
		// a terminate event and exit since this job had been recorded as
		// pending termination, but somehow the job didn't leave the queue
		// and the schedd is trying to restart it again..
	if( jobAd->LookupInteger(ATTR_TERMINATION_PENDING, pending)) {
		if (pending == TRUE) {
			// If the classad of this job "thinks" that this job should be
			// finished already, let's enact that belief.
			// This function does not return.
			this->terminateJob(US_TERMINATE_PENDING);
		}
	}

		// If we need to claim the startd before activating the claim
	int wantClaiming = 0;
	jobAd->LookupBool(ATTR_CLAIM_STARTD, wantClaiming);
	if (wantClaiming) {
		MyString startdSinful;
		MyString claimid;

			// Pull startd addr and claimid out of the jobad
		jobAd->LookupString(ATTR_STARTD_IP_ADDR, startdSinful);
		jobAd->LookupString(ATTR_CLAIM_ID, claimid);

		dprintf(D_ALWAYS, "%s is true, trying to claim startd %s\n", ATTR_CLAIM_STARTD, startdSinful.Value());

		classy_counted_ptr<DCStartd> startd = new DCStartd("description", NULL, startdSinful.Value(), claimid.Value());
	
		classy_counted_ptr<DCMsgCallback> cb = 
			new DCMsgCallback((DCMsgCallback::CppFunction)&BaseShadow::startdClaimedCB,
			this, jobAd);
																 
			// this can't fail, will always call the callback
		startd->asyncRequestOpportunisticClaim(jobAd, 
											   "description", 
											   daemonCore->InfoCommandSinfulString(), 
											   1200 /*alive interval*/, 
											   20 /* net timeout*/, 
											   100 /*total timeout*/, 
											   cb);
	}
}
示例#28
0
void
init_params()
{
	char	*tmp;
	static	int	master_name_in_config = 0;

	if( ! master_name_in_config ) {
			// First time, or we know it's not in the config file. 
		if( ! MasterName ) {
				// Not set on command line
			tmp = param( "MASTER_NAME" );
			if( tmp ) {
				MasterName = build_valid_daemon_name( tmp );
				master_name_in_config = 1;
				free( tmp );
			} 
		}
	} else {
		delete [] MasterName;
		tmp = param( "MASTER_NAME" );
		MasterName = build_valid_daemon_name( tmp );
		free( tmp );
	}
	if( MasterName ) {
		dprintf( D_FULLDEBUG, "Using name: %s\n", MasterName );
	}
			
	if (!param_boolean_crufty("START_MASTER", true)) {
			dprintf( D_ALWAYS, "START_MASTER was set to FALSE, shutting down.\n" );
			StartDaemons = FALSE;
			main_shutdown_graceful();
	}

		
	StartDaemons = TRUE;
	if (!param_boolean_crufty("START_DAEMONS", true)) {
			dprintf( D_ALWAYS, 
					 "START_DAEMONS flag was set to FALSE.  Not starting daemons.\n" );
			StartDaemons = FALSE;
	} 
		// If we were sent the daemons_off command, don't forget that
		// here. 
	if( GotDaemonsOff ) {
		StartDaemons = FALSE;
	}

	PublishObituaries = param_boolean_crufty("PUBLISH_OBITUARIES", true) ? TRUE : FALSE;

	Lines = param_integer("OBITUARY_LOG_LENGTH",20);

	master_backoff_constant = param_integer( "MASTER_BACKOFF_CONSTANT", 9, 1 );

	master_backoff_ceiling = param_integer( "MASTER_BACKOFF_CEILING", 3600,1 );

	master_backoff_factor = param_double( "MASTER_BACKOFF_FACTOR", 2.0, 0 );
	if( master_backoff_factor <= 0.0 ) {
    	master_backoff_factor = 2.0;
    }
	
	master_recover_time = param_integer( "MASTER_RECOVER_FACTOR", 300, 1 );

	update_interval = param_integer( "MASTER_UPDATE_INTERVAL", 5 * MINUTE, 1 );

	check_new_exec_interval = param_integer( "MASTER_CHECK_NEW_EXEC_INTERVAL", 5*MINUTE );

	new_bin_delay = param_integer( "MASTER_NEW_BINARY_DELAY", 2*MINUTE, 1 );

	new_bin_restart_mode = GRACEFUL;
	char * restart_mode = param("MASTER_NEW_BINARY_RESTART");
	if (restart_mode) {
#if 1
		StopStateT mode = StringToStopState(restart_mode);
#else
		static const struct {
			const char * text;
			StopStateT   mode;
			} modes[] = {
				{ "GRACEFUL", GRACEFUL },
				{ "PEACEFUL", PEACEFUL },
				{ "NEVER", NONE }, { "NONE", NONE }, { "NO", NONE },
			//	{ "FAST", FAST },
			//	{ "KILL", KILL },
			};
		StopStateT mode = (StopStateT)-1; // prime with -1 so we can detect bad input.
		for (int ii = 0; ii < (int)COUNTOF(modes); ++ii) {
			if (MATCH == strcasecmp(restart_mode, modes[ii].text)) {
				mode = modes[ii].mode;
				break;
			}
		}
#endif
		if (mode == (StopStateT)-1)	{
			dprintf(D_ALWAYS, "%s is not a valid value for MASTER_NEW_BINARY_RESTART. using GRACEFUL\n", restart_mode);
		}
		if (mode >= 0 && mode <= NONE)
			new_bin_restart_mode = mode;
		free(restart_mode);
	}

	preen_interval = param_integer( "PREEN_INTERVAL", 24*HOUR, 0 );
	if(preen_interval == 0) {
		EXCEPT("PREEN_INTERVAL in the condor configuration is too low (0).  Please set it to an integer in the range 1 to %d (default %d).  To disable condor_preen entirely, comment out PREEN.", INT_MAX, 24*HOUR);

	}

	shutdown_fast_timeout = param_integer( "SHUTDOWN_FAST_TIMEOUT", 5*MINUTE, 1 );

	shutdown_graceful_timeout = param_integer( "SHUTDOWN_GRACEFUL_TIMEOUT", 30*MINUTE, 1 );

	AllowAdminCommands = param_boolean( "ALLOW_ADMIN_COMMANDS", true );

	if( FS_Preen ) {
		free( FS_Preen );
	}
	FS_Preen = param( "PREEN" );
}
示例#29
0
void
JobQueueDBManager::config(bool reconfig) 
{
	char *tmp;
	MyString sql_str;
	int bndcnt = 0;
	const char *data_arr[3];
	QuillAttrDataType   data_typ[3];

	if (param_boolean("QUILL_ENABLED", false) == false) {
		EXCEPT("Quill++ is currently disabled. Please set QUILL_ENABLED to "
			   "TRUE if you want this functionality and read the manual "
			   "about this feature since it requires other attributes to be "
			   "set properly.");
	}

		//bail out if no SPOOL variable is defined since its used to 
		//figure out the location of the job_queue.log file
	char *spool = param("SPOOL");
	if(!spool) {
		EXCEPT("No SPOOL variable found in config file\n");
	}
  
	jobQueueLogFile = (char *) malloc(_POSIX_PATH_MAX * sizeof(char));
	snprintf(jobQueueLogFile,_POSIX_PATH_MAX * sizeof(char), 
			 "%s/job_queue.log", spool);

		/*
		  Here we try to read the database parameters in config
		  the db ip address format is <ipaddress:port> 
		*/
	dt = getConfigDBType();

	jobQueueDBIpAddress = param("QUILL_DB_IP_ADDR");

	jobQueueDBName = param("QUILL_DB_NAME");

	jobQueueDBUser = param("QUILL_DB_USER");
  	
	jobQueueDBConn = getDBConnStr(jobQueueDBIpAddress,
								  jobQueueDBName,
								  jobQueueDBUser,
								  spool);
								  
	dprintf(D_ALWAYS, "Using Job Queue File %s\n", jobQueueLogFile);

	dprintf(D_ALWAYS, "Using Database Type = Postgres\n");
	dprintf(D_ALWAYS, "Using Database IpAddress = %s\n", 
			jobQueueDBIpAddress?jobQueueDBIpAddress:"");
	dprintf(D_ALWAYS, "Using Database Name = %s\n", 
			jobQueueDBName?jobQueueDBName:"");
	dprintf(D_ALWAYS, "Using Database User = %s\n", 
			jobQueueDBUser?jobQueueDBUser:"");

	if(spool) {
		free(spool);
		spool = NULL;
	}

		// this function is also called when condor_reconfig is issued
		// and so we dont want to recreate all essential objects
	if(!reconfig) {
		prober = new ClassAdLogProber();
		caLogParser = new ClassAdLogParser();

		switch (dt) {				
		case T_PGSQL:
			DBObj = new PGSQLDatabase(jobQueueDBConn);
			break;
		default:
			break;;
		}

		xactState = NOT_IN_XACT;

		QuillErrCode ret_st;

		ret_st = DBObj->connectDB();
		if (ret_st == QUILL_FAILURE) {
			displayErrorMsg("config: unable to connect to DB--- ERROR");
			EXCEPT("config: unable to connect to DB\n");
		}

			/* the following will also throw an exception if the schema 
			   version is not correct */
		DBObj->assertSchemaVersion();
		
		tmp = param( "SCHEDD_NAME" );
		if( tmp ) {
			scheddname = build_valid_daemon_name( tmp );
			dprintf(D_FULLDEBUG, "scheddname %s built from param value %s\n", 
					scheddname, tmp);
			free(tmp);

		} else {
			scheddname = default_daemon_name();
			dprintf(D_FULLDEBUG, "scheddname built from default daemon name: %s\n", scheddname);
		}

		{
				/* create an entry in jobqueuepollinginfo if this schedd is the 
				 * first time being logged to database
				 */
			sql_str.formatstr("INSERT INTO jobqueuepollinginfo (scheddname, last_file_mtime, last_file_size) SELECT '%s', 0, 0 FROM dummy_single_row_table WHERE NOT EXISTS (SELECT * FROM jobqueuepollinginfo WHERE scheddname = '%s')", scheddname, scheddname);
		
			ret_st = DBObj->execCommand(sql_str.Value());
			if (ret_st == QUILL_FAILURE) {
				dprintf(D_ALWAYS, "Insert JobQueuePollInfo --- ERROR [SQL] %s\n", 
						sql_str.Value());
			}			
		}

		{
				/* create an entry in currency table if this schedd is the first
				 * time being logged to database 
				 */
			sql_str.formatstr("INSERT INTO currencies (datasource) SELECT '%s' FROM dummy_single_row_table WHERE NOT EXISTS (SELECT * FROM currencies WHERE datasource = '%s')", scheddname, scheddname);

			ret_st = DBObj->execCommand(sql_str.Value());
			if (ret_st == QUILL_FAILURE) {
				dprintf(D_ALWAYS, "Insert Currency --- ERROR [SQL] %s\n", sql_str.Value());
			}
		}
		
		ret_st = DBObj->commitTransaction();
		if (ret_st == QUILL_FAILURE) {
			dprintf(D_ALWAYS, "Commit transaction failed in JobQueueDBManager::config\n");
		}

		if (param_boolean("QUILL_MAINTAIN_DB_CONN", true) == false) {
			ret_st = DBObj->disconnectDB();
			if (ret_st == QUILL_FAILURE) {
				dprintf(D_ALWAYS, "JobQueueDBManager:config: unable to disconnect database --- ERROR\n");
			}	
		}
	}

		//this function assumes that certain members have been initialized
		// (specifically prober and caLogParser) and so the order is important.
	setJobQueueFileName(jobQueueLogFile);
}
示例#30
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}

	// Increase the OOM score of this process; the child will inherit it.
	// This way, the job will be heavily preferred to be killed over a normal process.
	// OOM score is currently exponential - a score of 4 is a factor-16 increase in
	// the OOM score.
	setupOOMScore(4,800);
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
		/* Note on CONDOR_UNIVERSE_LOCAL - The cgroup setup code below
		 *  requires a unique name for the cgroup. It relies on
		 *  uniqueness of the MachineAd's Name
		 *  attribute. Unfortunately, in the local universe the
		 *  MachineAd (mach_ad elsewhere) is never populated, because
		 *  there is no machine. As a result the ASSERT on
		 *  starter_name fails. This means that the local universe
		 *  will not work on any machine that has BASE_CGROUP
		 *  configured. A potential workaround is to set
		 *  STARTER.BASE_CGROUP on any machine that is also running a
		 *  schedd, but that disables cgroup support from a
		 *  co-resident startd. Instead, I'm disabling cgroup support
		 *  from within the local universe until the intraction of
		 *  local universe and cgroups can be properly worked
		 *  out. -matt 7 nov '12
		 */
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		if (starter_name.size() == 0) {
			char buf[16];
			sprintf(buf, "%d", getpid());
			starter_name = buf;
		}
		//ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup_str += this->CgroupSuffix();
		
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	auto_free_ptr mount_under_scratch(param("MOUNT_UNDER_SCRATCH"));
	if (mount_under_scratch) {
		// try evaluating mount_under_scratch as a classad expression, if it is
		// an expression it must return a string. if it's not an expression, just
		// use it as a string (as we did before 8.3.6)
		classad::Value value;
		if (JobAd->EvaluateExpr(mount_under_scratch.ptr(), value)) {
			const char * pval = NULL;
			if (value.IsStringValue(pval)) {
				mount_under_scratch.set(strdup(pval));
			} else {
				// was an expression, but not a string, so report and error and fail.
				dprintf(D_ALWAYS | D_ERROR,
					"ERROR: MOUNT_UNDER_SCRATCH does not evaluate to a string, it is : %s\n",
					ClassAdValueToString(value));
				return FALSE;
			}
		}
	}

	// if execute dir is encrypted, add /tmp and /var/tmp to mount_under_scratch
	bool encrypt_execdir = false;
	JobAd->LookupBool(ATTR_ENCRYPT_EXECUTE_DIRECTORY,encrypt_execdir);
	if (encrypt_execdir || param_boolean_crufty("ENCRYPT_EXECUTE_DIRECTORY",false)) {
		// prepend /tmp, /var/tmp to whatever admin wanted. don't worry
		// if admin already listed /tmp etc - subdirs can appear twice
		// in this list because AddMapping() ok w/ duplicate entries
		MyString buf("/tmp,/var/tmp,");
		buf += mount_under_scratch.ptr();
		mount_under_scratch.set(buf.StrDup());
	}
	if (mount_under_scratch) {
		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							delete fs_remap;
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							delete fs_remap;
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						delete fs_remap;
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			delete fs_remap;
			return FALSE;
		}
		mount_under_scratch.clear();
	}

#if defined(LINUX)
	// On Linux kernel 2.6.24 and later, we can give each
	// job its own PID namespace
	if (param_boolean("USE_PID_NAMESPACES", false)) {
		if (!can_switch_ids()) {
			EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
				"call in Linux unless running as root.");
		}
		fi.want_pid_namespace = this->SupportsPIDNamespace();
		if (fi.want_pid_namespace) {
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			fs_remap->RemapProc();
		}

		// When PID Namespaces are enabled, need to run the job
		// under the condor_pid_ns_init program, so that signals
		// propagate through to the child.  

		// First tell the program where to log output status
		// via an environment variable
		if (param_boolean("USE_PID_NAMESPACE_INIT", true)) {
			Env env;
			MyString env_errors;
			MyString arg_errors;
			std::string filename;

			filename = Starter->GetWorkingDir();
			filename += "/.condor_pid_ns_status";
		
			env.MergeFrom(JobAd, &env_errors);
			env.SetEnv("_CONDOR_PID_NS_INIT_STATUS_FILENAME", filename);
			env.InsertEnvIntoClassAd(JobAd, &env_errors);

			Starter->jic->removeFromOutputFiles(condor_basename(filename.c_str()));
			this->m_pid_ns_status_filename = filename;
			
			// Now, set the job's CMD to the wrapper, and shift
			// over the arguments by one

			ArgList args;
			std::string cmd;

			JobAd->LookupString(ATTR_JOB_CMD, cmd);
			args.AppendArg(cmd);
			args.AppendArgsFromClassAd(JobAd, &arg_errors);
			args.InsertArgsIntoClassAd(JobAd, NULL, & arg_errors);
	
			std::string libexec;
			if( !param(libexec,"LIBEXEC") ) {
				dprintf(D_ALWAYS, "Cannot find LIBEXEC so can not run condor_pid_ns_init\n");
				return 0;
			}
			std::string c_p_n_i = libexec + "/condor_pid_ns_init";
			JobAd->Assign(ATTR_JOB_CMD, c_p_n_i);
		}
	}
	dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
#endif


	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	// See Note near setup of param(BASE_CGROUP)
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "CGROUP_MEMORY_LIMIT_POLICY", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				m_memory_limit = MemMb_big;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);

				// Note that ATTR_VIRTUAL_MEMORY on Linux
				// is sum of memory and swap, in Kilobytes

				int VMemKb;
				if (MachineAd->LookupInteger(ATTR_VIRTUAL_MEMORY, VMemKb)) {

					uint64_t memsw_limit = ((uint64_t)1024) * VMemKb;
					if (VMemKb > 0) {
						// we're not allowed to set memsw limit <
						// the hard memory limit.  If we haven't set the hard
						// memory limit, the default may be infinity.
						// So, if we've set soft, set hard limit to memsw - one page
						if (mem_is_soft) {
							uint64_t hard_limit = memsw_limit - 4096;
							climits.set_memory_limit_bytes(hard_limit, false);
						}
						climits.set_memsw_limit_bytes(memsw_limit);
					}
				} else {
					dprintf(D_ALWAYS, "Not setting virtual memory limit in cgroup because "
						"Virtual Memory attribute missing in machine ad.\n");
				}
			} else {
				dprintf(D_ALWAYS, "Not setting memory limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of CGROUP_MEMORY_LIMIT_POLICY: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int numCores = 1;
		if (MachineAd->LookupInteger(ATTR_CPUS, numCores)) {
			climits.set_cpu_shares(numCores*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of Cpus in machine ClassAd; ignoring.\n");
		}
		setupOOMEvent(cgroup);
	}

    m_statistics.Reconfig();

	// Now that the job is started, decrease the likelihood that the starter
	// is killed instead of the job itself.
	if (retval)
	{
		setupOOMScore(0,0);
	}

#endif

	return retval;
}