Example #1
0
ArgList *BoincJob::GetArgs()
{
	// Builds a new argument list from the job ad.  A parse failure is only
	// logged: the (possibly partially filled) list is still returned, so
	// the result is never NULL.  The list is allocated with new; presumably
	// the caller is responsible for deleting it -- confirm against callers.
	MyString parse_errors;
	ArgList *job_args = new ArgList();

	const bool parsed_ok = job_args->AppendArgsFromClassAd( jobAd, &parse_errors );
	if( parsed_ok ) {
		return job_args;
	}

	dprintf( D_ALWAYS, "(%d.%d) Failed to read job arguments: %s\n",
			 procID.cluster, procID.proc, parse_errors.Value() );
	return job_args;
}
Example #2
0
int DockerProc::StartJob() {
	std::string imageID;

	if( ! JobAd->LookupString( ATTR_DOCKER_IMAGE, imageID ) ) {
		dprintf( D_ALWAYS | D_FAILURE, "%s not defined in job ad, unable to start job.\n", ATTR_DOCKER_IMAGE );
		return FALSE;
	}

	std::string command;
	JobAd->LookupString( ATTR_JOB_CMD, command );
	dprintf( D_FULLDEBUG, "%s: '%s'\n", ATTR_JOB_CMD, command.c_str() );

	std::string sandboxPath = Starter->jic->jobRemoteIWD();

	//
	// This code is deliberately wrong, probably for backwards-compability.
	// (See the code in JICShadow::beginFileTransfer(), which assumes that
	// we transferred the executable if ATTR_TRANSFER_EXECUTABLE is unset.)
	// Rather than risk breaking anything by fixing condor_submit (which
	// does not set ATTR_TRANSFER_EXECUTABLE unless it's false) -- and
	// introducing a version dependency -- assume the executable was
	// transferred unless it was explicitly noted otherwise.
	//
	bool transferExecutable = true;
	JobAd->LookupBool( ATTR_TRANSFER_EXECUTABLE, transferExecutable );
	if( transferExecutable ) {
		command = sandboxPath + "/" + command;
	}

	ArgList args;
	args.SetArgV1SyntaxToCurrentPlatform();
	MyString argsError;
	if( ! args.AppendArgsFromClassAd( JobAd, & argsError ) ) {
		dprintf( D_ALWAYS | D_FAILURE, "Failed to read job arguments from job ad: '%s'.\n", argsError.c_str() );
		return FALSE;
	}

	Env job_env;
	MyString env_errors;
	if( !Starter->GetJobEnv(JobAd,&job_env,&env_errors) ) {
		dprintf( D_ALWAYS, "Aborting DockerProc::StartJob: %s\n", env_errors.Value());
		return 0;
	}

	// The GlobalJobID is unsuitable by virtue its octothorpes.  This
	// construction is informative, but could be made even less likely
	// to collide if it had a timestamp.
	formatstr( containerName, "HTCJob%d_%d_%s_PID%d",
		Starter->jic->jobCluster(),
		Starter->jic->jobProc(),
		Starter->getMySlotName().c_str(), // note: this can be "" for single slot machines.
		getpid() );


	//
	// Do I/O redirection (includes streaming).
	//

	int childFDs[3] = { -2, -2, -2 };
	{
	TemporaryPrivSentry sentry(PRIV_USER);
	// getStdFile() returns -1 on error.

	if( -1 == (childFDs[0] = openStdFile( SFT_IN, NULL, true, "Input file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stdin.\n" );
		return FALSE;
	}
	if( -1 == (childFDs[1] = openStdFile( SFT_OUT, NULL, true, "Output file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stdout.\n" );
		daemonCore->Close_FD( childFDs[0] );
		return FALSE;
	}
	if( -1 == (childFDs[2] = openStdFile( SFT_ERR, NULL, true, "Error file" )) ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerProc::StartJob(): failed to open stderr.\n" );
		daemonCore->Close_FD( childFDs[0] );
		daemonCore->Close_FD( childFDs[1] );
		return FALSE;
	}
	}

	  // Ulog the execute event
	Starter->jic->notifyJobPreSpawn();

	CondorError err;
	// DockerAPI::run() returns a PID from daemonCore->Create_Process(), which
	// makes it suitable for passing up into VanillaProc.  This combination
	// will trigger the reaper(s) when the container terminates.
	
	ClassAd *machineAd = Starter->jic->machClassAd();

	std::list<std::string> extras;
	buildExtraVolumes(extras);

	int rv = DockerAPI::run( *machineAd, containerName, imageID, command, args, job_env, sandboxPath, extras, JobPid, childFDs, err );
	if( rv < 0 ) {
		dprintf( D_ALWAYS | D_FAILURE, "DockerAPI::run( %s, %s, ... ) failed with return value %d\n", imageID.c_str(), command.c_str(), rv );
		return FALSE;
	}
	dprintf( D_FULLDEBUG, "DockerAPI::run() returned pid %d\n", JobPid );


	// TODO: Start a timer to poll for job usage updates.

	++num_pids; // Used by OsProc::PublishUpdateAd().
	return TRUE;
}
Example #3
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}

	// Increase the OOM score of this process; the child will inherit it.
	// This way, the job will be heavily preferred to be killed over a normal process.
	// OOM score is currently exponential - a score of 4 is a factor-16 increase in
	// the OOM score.
	setupOOMScore(4,800);
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
		/* Note on CONDOR_UNIVERSE_LOCAL - The cgroup setup code below
		 *  requires a unique name for the cgroup. It relies on
		 *  uniqueness of the MachineAd's Name
		 *  attribute. Unfortunately, in the local universe the
		 *  MachineAd (mach_ad elsewhere) is never populated, because
		 *  there is no machine. As a result the ASSERT on
		 *  starter_name fails. This means that the local universe
		 *  will not work on any machine that has BASE_CGROUP
		 *  configured. A potential workaround is to set
		 *  STARTER.BASE_CGROUP on any machine that is also running a
		 *  schedd, but that disables cgroup support from a
		 *  co-resident startd. Instead, I'm disabling cgroup support
		 *  from within the local universe until the intraction of
		 *  local universe and cgroups can be properly worked
		 *  out. -matt 7 nov '12
		 */
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		if (starter_name.size() == 0) {
			char buf[16];
			sprintf(buf, "%d", getpid());
			starter_name = buf;
		}
		//ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup_str += this->CgroupSuffix();
		
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	auto_free_ptr mount_under_scratch(param("MOUNT_UNDER_SCRATCH"));
	if (mount_under_scratch) {
		// try evaluating mount_under_scratch as a classad expression, if it is
		// an expression it must return a string. if it's not an expression, just
		// use it as a string (as we did before 8.3.6)
		classad::Value value;
		if (JobAd->EvaluateExpr(mount_under_scratch.ptr(), value)) {
			const char * pval = NULL;
			if (value.IsStringValue(pval)) {
				mount_under_scratch.set(strdup(pval));
			} else {
				// was an expression, but not a string, so report and error and fail.
				dprintf(D_ALWAYS | D_ERROR,
					"ERROR: MOUNT_UNDER_SCRATCH does not evaluate to a string, it is : %s\n",
					ClassAdValueToString(value));
				return FALSE;
			}
		}
	}

	// if execute dir is encrypted, add /tmp and /var/tmp to mount_under_scratch
	bool encrypt_execdir = false;
	JobAd->LookupBool(ATTR_ENCRYPT_EXECUTE_DIRECTORY,encrypt_execdir);
	if (encrypt_execdir || param_boolean_crufty("ENCRYPT_EXECUTE_DIRECTORY",false)) {
		// prepend /tmp, /var/tmp to whatever admin wanted. don't worry
		// if admin already listed /tmp etc - subdirs can appear twice
		// in this list because AddMapping() ok w/ duplicate entries
		MyString buf("/tmp,/var/tmp,");
		buf += mount_under_scratch.ptr();
		mount_under_scratch.set(buf.StrDup());
	}
	if (mount_under_scratch) {
		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							delete fs_remap;
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							delete fs_remap;
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						delete fs_remap;
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			delete fs_remap;
			return FALSE;
		}
		mount_under_scratch.clear();
	}

#if defined(LINUX)
	// On Linux kernel 2.6.24 and later, we can give each
	// job its own PID namespace
	if (param_boolean("USE_PID_NAMESPACES", false)) {
		if (!can_switch_ids()) {
			EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
				"call in Linux unless running as root.");
		}
		fi.want_pid_namespace = this->SupportsPIDNamespace();
		if (fi.want_pid_namespace) {
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			fs_remap->RemapProc();
		}

		// When PID Namespaces are enabled, need to run the job
		// under the condor_pid_ns_init program, so that signals
		// propagate through to the child.  

		// First tell the program where to log output status
		// via an environment variable
		if (param_boolean("USE_PID_NAMESPACE_INIT", true)) {
			Env env;
			MyString env_errors;
			MyString arg_errors;
			std::string filename;

			filename = Starter->GetWorkingDir();
			filename += "/.condor_pid_ns_status";
		
			env.MergeFrom(JobAd, &env_errors);
			env.SetEnv("_CONDOR_PID_NS_INIT_STATUS_FILENAME", filename);
			env.InsertEnvIntoClassAd(JobAd, &env_errors);

			Starter->jic->removeFromOutputFiles(condor_basename(filename.c_str()));
			this->m_pid_ns_status_filename = filename;
			
			// Now, set the job's CMD to the wrapper, and shift
			// over the arguments by one

			ArgList args;
			std::string cmd;

			JobAd->LookupString(ATTR_JOB_CMD, cmd);
			args.AppendArg(cmd);
			args.AppendArgsFromClassAd(JobAd, &arg_errors);
			args.InsertArgsIntoClassAd(JobAd, NULL, & arg_errors);
	
			std::string libexec;
			if( !param(libexec,"LIBEXEC") ) {
				dprintf(D_ALWAYS, "Cannot find LIBEXEC so can not run condor_pid_ns_init\n");
				return 0;
			}
			std::string c_p_n_i = libexec + "/condor_pid_ns_init";
			JobAd->Assign(ATTR_JOB_CMD, c_p_n_i);
		}
	}
	dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
#endif


	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	// See Note near setup of param(BASE_CGROUP)
	if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "CGROUP_MEMORY_LIMIT_POLICY", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				m_memory_limit = MemMb_big;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);

				// Note that ATTR_VIRTUAL_MEMORY on Linux
				// is sum of memory and swap, in Kilobytes

				int VMemKb;
				if (MachineAd->LookupInteger(ATTR_VIRTUAL_MEMORY, VMemKb)) {

					uint64_t memsw_limit = ((uint64_t)1024) * VMemKb;
					if (VMemKb > 0) {
						// we're not allowed to set memsw limit <
						// the hard memory limit.  If we haven't set the hard
						// memory limit, the default may be infinity.
						// So, if we've set soft, set hard limit to memsw - one page
						if (mem_is_soft) {
							uint64_t hard_limit = memsw_limit - 4096;
							climits.set_memory_limit_bytes(hard_limit, false);
						}
						climits.set_memsw_limit_bytes(memsw_limit);
					}
				} else {
					dprintf(D_ALWAYS, "Not setting virtual memory limit in cgroup because "
						"Virtual Memory attribute missing in machine ad.\n");
				}
			} else {
				dprintf(D_ALWAYS, "Not setting memory limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of CGROUP_MEMORY_LIMIT_POLICY: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int numCores = 1;
		if (MachineAd->LookupInteger(ATTR_CPUS, numCores)) {
			climits.set_cpu_shares(numCores*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of Cpus in machine ClassAd; ignoring.\n");
		}
		setupOOMEvent(cgroup);
	}

    m_statistics.Reconfig();

	// Now that the job is started, decrease the likelihood that the starter
	// is killed instead of the job itself.
	if (retval)
	{
		setupOOMScore(0,0);
	}

#endif

	return retval;
}
Example #4
0
int
VanillaProc::StartJob()
{
	dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n");

	// vanilla jobs, unlike standard jobs, are allowed to run 
	// shell scripts (or as is the case on NT, batch files).  so
	// edit the ad so we start up a shell, pass the executable as
	// an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

	CHAR		interpreter[MAX_PATH+1],
				systemshell[MAX_PATH+1];    
	const char* jobtmp				= Starter->jic->origJobName();
	int			joblen				= strlen(jobtmp);
	const char	*extension			= joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
	bool		binary_executable	= ( extension && 
										( MATCH == strcasecmp ( ".exe", extension ) || 
										  MATCH == strcasecmp ( ".com", extension ) ) ),
				java_universe		= ( CONDOR_UNIVERSE_JAVA == job_universe );
	ArgList		arguments;
	MyString	filename,
				jobname, 
				error;
	
	if ( extension && !java_universe && !binary_executable ) {

		/** since we do not actually know how long the extension of
			the file is, we'll need to hunt down the '.' in the path,
			if it exists */
		extension = strrchr ( jobtmp, '.' );

		if ( !extension ) {

			dprintf ( 
				D_ALWAYS, 
				"VanillaProc::StartJob(): Failed to extract "
				"the file's extension.\n" );

			/** don't fail here, since we want executables to run
				as usual.  That is, some condor jobs submit 
				executables that do not have the '.exe' extension,
				but are, nonetheless, executable binaries.  For
				instance, a submit script may contain:

				executable = executable$(OPSYS) */

		} else {

			/** pull out the path to the executable */
			if ( !JobAd->LookupString ( 
				ATTR_JOB_CMD, 
				jobname ) ) {
				
				/** fall back on Starter->jic->origJobName() */
				jobname = jobtmp;

			}

			/** If we transferred the job, it may have been
				renamed to condor_exec.exe even though it is
				not an executable. Here we rename it back to
				a the correct extension before it will run. */
			if ( MATCH == strcasecmp ( 
					CONDOR_EXEC, 
					condor_basename ( jobname.Value () ) ) ) {
				filename.formatstr ( "condor_exec%s", extension );
				if (rename(CONDOR_EXEC, filename.Value()) != 0) {
					dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
							"failed to rename executable from %s to %s\n", 
							CONDOR_EXEC, filename.Value() );
				}
			} else {
				filename = jobname;
			}
			
			/** Since we've renamed our executable, we need to
				update the job ad to reflect this change. */
			if ( !JobAd->Assign ( 
				ATTR_JOB_CMD, 
				filename ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"set new executable name.\n" );

				return FALSE;

			}

			/** We've moved the script to argv[1], so we need to 
				add	the remaining arguments to positions argv[2]..
				argv[/n/]. */
			if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
				 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, 
				&error ) ) {

				dprintf (
					D_ALWAYS,
					"VanillaProc::StartJob(): ERROR: failed to "
					"get arguments from job ad: %s\n",
					error.Value () );

				return FALSE;

			}

			/** Since we know already we don't want this file returned
				to us, we explicitly add it to an exception list which
				will stop the file transfer mechanism from considering
				it for transfer back to its submitter */
			Starter->jic->removeFromOutputFiles (
				filename.Value () );

		}
			
	}
#endif

	// set up a FamilyInfo structure to tell OsProc to register a family
	// with the ProcD in its call to DaemonCore::Create_Process
	//
	FamilyInfo fi;

	// take snapshots at no more than 15 seconds in between, by default
	//
	fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

	m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
	if( ThisProcRunsAlongsideMainProc() ) {
			// If we track a secondary proc's family tree (such as
			// sshd) using the same dedicated account as the job's
			// family tree, we could end up killing the job when we
			// clean up the secondary family.
		m_dedicated_account = NULL;
	}
	if (m_dedicated_account) {
			// using login-based family tracking
		fi.login = m_dedicated_account;
			// The following message is documented in the manual as the
			// way to tell whether the dedicated execution account
			// configuration is being used.
		dprintf(D_ALWAYS,
		        "Tracking process family by login \"%s\"\n",
		        fi.login);
	}

	FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
	// on Linux, we also have the ability to track processes via
	// a phony supplementary group ID
	//
	gid_t tracking_gid = 0;
	if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
		if (!can_switch_ids() &&
		    (Starter->condorPrivSepHelper() == NULL))
		{
			EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
			           "the group list of our children unless running as "
			           "root or using PrivSep");
		}
		fi.group_ptr = &tracking_gid;
	}
#endif

#if defined(HAVE_EXT_LIBCGROUP)
	// Determine the cgroup
	std::string cgroup_base;
	param(cgroup_base, "BASE_CGROUP", "");
	MyString cgroup_str;
	const char *cgroup = NULL;
	if (cgroup_base.length()) {
		MyString cgroup_uniq;
		std::string starter_name, execute_str;
		param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
			// Note: Starter is a global variable from os_proc.cpp
		Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
		ASSERT (starter_name.size());
		cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
		const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
		cgroup_uniq.replaceString(dir_delim, "_");
		cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
			cgroup_uniq.Value());
		cgroup = cgroup_str.Value();
		ASSERT (cgroup != NULL);
		fi.cgroup = cgroup;
		dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
	}

#endif

// The chroot stuff really only works on linux
#ifdef LINUX
	{
        // Have Condor manage a chroot
       std::string requested_chroot_name;
       JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
       const char * allowed_root_dirs = param("NAMED_CHROOT");
       if (requested_chroot_name.size()) {
               dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
               StringList chroot_list(allowed_root_dirs);
               chroot_list.rewind();
               const char * next_chroot;
               bool acceptable_chroot = false;
               std::string requested_chroot;
               while ( (next_chroot=chroot_list.next()) ) {
                       MyString chroot_spec(next_chroot);
                       chroot_spec.Tokenize();
                       const char * chroot_name = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       const char * next_dir = chroot_spec.GetNextToken("=", false);
                       if (chroot_name == NULL) {
                               dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                       }
                       dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                       if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                               acceptable_chroot = true;
                               requested_chroot = next_dir;
                       }
               }
               // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
               // Is this the responsibility of Condor to check, or the sysadmin who set it up?
               if (!acceptable_chroot) {
                       return FALSE;
               }
               dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

               std::stringstream ss;
               std::stringstream ss2;
               ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
               std::string execute_dir = ss2.str();
               ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
               std::string full_dir_str = ss.str();
               if (is_trivial_rootdir(requested_chroot)) {
                   dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
               } else if (IsDirectory(execute_dir.c_str())) {
                       {
                           TemporaryPrivSentry sentry(PRIV_ROOT);
                           if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                               dprintf( D_FAILURE|D_ALWAYS,
                                   "Failed to create sandbox directory in chroot (%s): %s\n",
                                   full_dir_str.c_str(),
                                   strerror(errno) );
                               return FALSE;
                           }
                           if (chown(full_dir_str.c_str(),
                                     get_user_uid(),
                                     get_user_gid()) == -1)
                           {
                               EXCEPT("chown error on %s: %s",
                                      full_dir_str.c_str(),
                                      strerror(errno));
                           }
                       }
                       if (!fs_remap) {
                               fs_remap = new FilesystemRemap();
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                       if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                               // FilesystemRemap object prints out an error message for us.
                               return FALSE;
                       }
                       dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                       std::string root_str("/");
                       if (fs_remap->AddMapping(requested_chroot, root_str)) {
                               return FALSE;
                       }
               } else {
                       dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
               }
       } else {
               dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
       }
	}
// End of chroot 
#endif


	// On Linux kernel 2.4.19 and later, we can give each job its
	// own FS mounts.
	char * mount_under_scratch = param("MOUNT_UNDER_SCRATCH");
	if (mount_under_scratch) {

		std::string working_dir = Starter->GetWorkingDir();

		if (IsDirectory(working_dir.c_str())) {
			StringList mount_list(mount_under_scratch);
			free(mount_under_scratch);

			mount_list.rewind();
			if (!fs_remap) {
				fs_remap = new FilesystemRemap();
			}
			char * next_dir;
			while ( (next_dir=mount_list.next()) ) {
				if (!*next_dir) {
					// empty string?
					mount_list.deleteCurrent();
					continue;
				}
				std::string next_dir_str(next_dir);
				// Gah, I wish I could throw an exception to clean up these nested if statements.
				if (IsDirectory(next_dir)) {
					char * full_dir = dirscat(working_dir, next_dir_str);
					if (full_dir) {
						std::string full_dir_str(full_dir);
						delete [] full_dir; full_dir = NULL;
						if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
							dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
							return FALSE;
						}
						dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
						if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
							// FilesystemRemap object prints out an error message for us.
							return FALSE;
						}
					} else {
						dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
						return FALSE;
					}
				} else {
					dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
				}
			}
		} else {
			dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
			return FALSE;
		}
	}

	// have OsProc start the job
	//
	int retval = OsProc::StartJob(&fi, fs_remap);

	if (fs_remap != NULL) {
		delete fs_remap;
	}

#if defined(HAVE_EXT_LIBCGROUP)

	// Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
	if (cgroup && retval) {
		std::string mem_limit;
		param(mem_limit, "MEMORY_LIMIT", "soft");
		bool mem_is_soft = mem_limit == "soft";
		std::string cgroup_string = cgroup;
		CgroupLimits climits(cgroup_string);
		if (mem_is_soft || (mem_limit == "hard")) {
			ClassAd * MachineAd = Starter->jic->machClassAd();
			int MemMb;
			if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
				uint64_t MemMb_big = MemMb;
				climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);
			} else {
				dprintf(D_ALWAYS, "Not setting memory soft limit in cgroup because "
					"Memory attribute missing in machine ad.\n");
			}
		} else if (mem_limit == "none") {
			dprintf(D_FULLDEBUG, "Not enforcing memory soft limit.\n");
		} else {
			dprintf(D_ALWAYS, "Invalid value of MEMORY_LIMIT: %s.  Ignoring.\n", mem_limit.c_str());
		}

		// Now, set the CPU shares
		ClassAd * MachineAd = Starter->jic->machClassAd();
		int slotWeight;
		if (MachineAd->LookupInteger(ATTR_SLOT_WEIGHT, slotWeight)) {
			climits.set_cpu_shares(slotWeight*100);
		} else {
			dprintf(D_FULLDEBUG, "Invalid value of SlotWeight in machine ClassAd; ignoring.\n");
		}
	}

#endif

	return retval;
}
Example #5
0
int
OsProc::StartJob(FamilyInfo* family_info, FilesystemRemap* fs_remap=NULL)
{
	int nice_inc = 0;
	bool has_wrapper = false;

	dprintf(D_FULLDEBUG,"in OsProc::StartJob()\n");

	if ( !JobAd ) {
		dprintf ( D_ALWAYS, "No JobAd in OsProc::StartJob()!\n" );
		return 0;
	}

	MyString JobName;
	if ( JobAd->LookupString( ATTR_JOB_CMD, JobName ) != 1 ) {
		dprintf( D_ALWAYS, "%s not found in JobAd.  Aborting StartJob.\n", 
				 ATTR_JOB_CMD );
		return 0;
	}

	const char* job_iwd = Starter->jic->jobRemoteIWD();
	dprintf( D_ALWAYS, "IWD: %s\n", job_iwd );

		// some operations below will require a PrivSepHelper if
		// PrivSep is enabled (if it's not, privsep_helper will be
		// NULL)
	PrivSepHelper* privsep_helper = Starter->privSepHelper();

		// // // // // // 
		// Arguments
		// // // // // // 

		// prepend the full path to this name so that we
		// don't have to rely on the PATH inside the
		// USER_JOB_WRAPPER or for exec().

    bool transfer_exe = false;
    if (!JobAd->LookupBool(ATTR_TRANSFER_EXECUTABLE, transfer_exe)) {
        transfer_exe = false;
    }

    bool preserve_rel = false;
    if (!JobAd->LookupBool(ATTR_PRESERVE_RELATIVE_EXECUTABLE, preserve_rel)) {
        preserve_rel = false;
    }

    bool relative_exe = is_relative_to_cwd(JobName.Value());

    if (relative_exe && preserve_rel && !transfer_exe) {
        dprintf(D_ALWAYS, "Preserving relative executable path: %s\n", JobName.Value());
    }
	else if ( strcmp(CONDOR_EXEC,JobName.Value()) == 0 ) {
		JobName.formatstr( "%s%c%s",
		                 Starter->GetWorkingDir(),
		                 DIR_DELIM_CHAR,
		                 CONDOR_EXEC );
    }
	else if (relative_exe && job_iwd && *job_iwd) {
		MyString full_name;
		full_name.formatstr("%s%c%s",
		                  job_iwd,
		                  DIR_DELIM_CHAR,
		                  JobName.Value());
		JobName = full_name;

	}

	if( Starter->isGridshell() ) {
			// if we're a gridshell, just try to chmod our job, since
			// globus probably transfered it for us and left it with
			// bad permissions...
		priv_state old_priv = set_user_priv();
		int retval = chmod( JobName.Value(), S_IRWXU | S_IRWXO | S_IRWXG );
		set_priv( old_priv );
		if( retval < 0 ) {
			dprintf ( D_ALWAYS, "Failed to chmod %s!\n", JobName.Value() );
			return 0;
		}
	} 

	ArgList args;

		// Since we may be adding to the argument list, we may need to deal
		// with platform-specific arg syntax in the user's args in order
		// to successfully merge them with the additional wrapper args.
	args.SetArgV1SyntaxToCurrentPlatform();

		// First, put "condor_exec" or whatever at the front of Args,
		// since that will become argv[0] of what we exec(), either
		// the wrapper or the actual job.

	if( !getArgv0() ) {
		args.AppendArg(JobName.Value());
	} else {
		args.AppendArg(getArgv0());
	}
	
		// Support USER_JOB_WRAPPER parameter...
	char *wrapper = NULL;
	if( (wrapper=param("USER_JOB_WRAPPER")) ) {

			// make certain this wrapper program exists and is executable
		if( access(wrapper,X_OK) < 0 ) {
			dprintf( D_ALWAYS, 
					 "Cannot find/execute USER_JOB_WRAPPER file %s\n",
					 wrapper );
			free( wrapper );
			return 0;
		}
		has_wrapper = true;
			// Now, we've got a valid wrapper.  We want that to become
			// "JobName" so we exec it directly, and we want to put
			// what was the JobName (with the full path) as the first
			// argument to the wrapper
		args.AppendArg(JobName.Value());
		JobName = wrapper;
		free(wrapper);
	}
	
		// Support USE_PARROT 
	bool use_parrot = false;
	if( JobAd->LookupBool( ATTR_USE_PARROT, use_parrot) ) {
			// Check for parrot executable
		char *parrot = NULL;
		if( (parrot=param("PARROT")) ) {
			if( access(parrot,X_OK) < 0 ) {
				dprintf( D_ALWAYS, "Unable to use parrot(Cannot find/execute "
					"at %s(%s)).\n", parrot, strerror(errno) );
				free( parrot );
				return 0;
			} else {
				args.AppendArg(JobName.Value());
				JobName = parrot;
				free( parrot );
			}
		} else {
			dprintf( D_ALWAYS, "Unable to use parrot(Undefined path in config"
			" file)" );
			return 0;
		}
	}

		// Either way, we now have to add the user-specified args as
		// the rest of the Args string.
	MyString args_error;
	if(!args.AppendArgsFromClassAd(JobAd,&args_error)) {
		dprintf(D_ALWAYS, "Failed to read job arguments from JobAd.  "
				"Aborting OsProc::StartJob: %s\n",args_error.Value());
		return 0;
	}

		// // // // // // 
		// Environment 
		// // // // // // 

		// Now, instantiate an Env object so we can manipulate the
		// environment as needed.
	Env job_env;

	MyString env_errors;
	if( !Starter->GetJobEnv(JobAd,&job_env,&env_errors) ) {
		dprintf( D_ALWAYS, "Aborting OSProc::StartJob: %s\n",
				 env_errors.Value());
		return 0;
	}


		// // // // // // 
		// Standard Files
		// // // // // // 

	// handle stdin, stdout, and stderr redirection
	int fds[3];
		// initialize these to -2 to mean they're not specified.
		// -1 will be treated as an error.
	fds[0] = -2; fds[1] = -2; fds[2] = -2;

		// in order to open these files we must have the user's privs:
	priv_state priv;
	priv = set_user_priv();

		// if we're in PrivSep mode, we won't necessarily be able to
		// open the files for the job. getStdFile will return us an
		// open FD in some situations, but otherwise will give us
		// a filename that we'll pass to the PrivSep Switchboard
		//
	bool stdin_ok;
	bool stdout_ok;
	bool stderr_ok;
	MyString privsep_stdin_name;
	MyString privsep_stdout_name;
	MyString privsep_stderr_name;
	if (privsep_helper != NULL) {
		stdin_ok = getStdFile(SFT_IN,
		                      NULL,
		                      true,
		                      "Input file",
		                      &fds[0],
		                      &privsep_stdin_name);
		stdout_ok = getStdFile(SFT_OUT,
		                       NULL,
		                       true,
		                       "Output file",
		                       &fds[1],
		                       &privsep_stdout_name);
		stderr_ok = getStdFile(SFT_ERR,
		                       NULL,
		                       true,
		                       "Error file",
		                       &fds[2],
		                       &privsep_stderr_name);
	}
	else {
		fds[0] = openStdFile( SFT_IN,
		                      NULL,
		                      true,
		                      "Input file");
		stdin_ok = (fds[0] != -1);
		fds[1] = openStdFile( SFT_OUT,
		                      NULL,
		                      true,
		                      "Output file");
		stdout_ok = (fds[1] != -1);
		fds[2] = openStdFile( SFT_ERR,
		                      NULL,
		                      true,
		                      "Error file");
		stderr_ok = (fds[2] != -1);
	}

	/* Bail out if we couldn't open the std files correctly */
	if( !stdin_ok || !stdout_ok || !stderr_ok ) {
		/* only close ones that had been opened correctly */
		for ( int i = 0; i <= 2; i++ ) {
			if ( fds[i] >= 0 ) {
				daemonCore->Close_FD ( fds[i] );
			}
		}
		dprintf(D_ALWAYS, "Failed to open some/all of the std files...\n");
		dprintf(D_ALWAYS, "Aborting OsProc::StartJob.\n");
		set_priv(priv); /* go back to original priv state before leaving */
		return 0;
	}

		// // // // // // 
		// Misc + Exec
		// // // // // // 

	if( !ThisProcRunsAlongsideMainProc() ) {
		Starter->jic->notifyJobPreSpawn();
	}

	// compute job's renice value by evaluating the machine's
	// JOB_RENICE_INCREMENT in the context of the job ad...

    char* ptmp = param( "JOB_RENICE_INCREMENT" );
	if( ptmp ) {
			// insert renice expr into our copy of the job ad
		MyString reniceAttr = "Renice = ";
		reniceAttr += ptmp;
		if( !JobAd->Insert( reniceAttr.Value() ) ) {
			dprintf( D_ALWAYS, "ERROR: failed to insert JOB_RENICE_INCREMENT "
				"into job ad, Aborting OsProc::StartJob...\n" );
			free( ptmp );
			return 0;
		}
			// evaluate
		if( JobAd->EvalInteger( "Renice", NULL, nice_inc ) ) {
			dprintf( D_ALWAYS, "Renice expr \"%s\" evaluated to %d\n",
					 ptmp, nice_inc );
		} else {
			dprintf( D_ALWAYS, "WARNING: job renice expr (\"%s\") doesn't "
					 "eval to int!  Using default of 10...\n", ptmp );
			nice_inc = 10;
		}

			// enforce valid ranges for nice_inc
		if( nice_inc < 0 ) {
			dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
					 "low: adjusted to 0\n", nice_inc );
			nice_inc = 0;
		}
		else if( nice_inc > 19 ) {
			dprintf( D_FULLDEBUG, "WARNING: job renice value (%d) is too "
					 "high: adjusted to 19\n", nice_inc );
			nice_inc = 19;
		}

		ASSERT( ptmp );
		free( ptmp );
		ptmp = NULL;
	} else {
			// if JOB_RENICE_INCREMENT is undefined, default to 0
		nice_inc = 0;
	}

		// in the below dprintfs, we want to skip past argv[0], which
		// is sometimes condor_exec, in the Args string. 

	MyString args_string;
	args.GetArgsStringForDisplay(&args_string, 1);
	if( has_wrapper ) { 
			// print out exactly what we're doing so folks can debug
			// it, if they need to.
		dprintf( D_ALWAYS, "Using wrapper %s to exec %s\n", JobName.Value(), 
				 args_string.Value() );
	} else {
		dprintf( D_ALWAYS, "About to exec %s %s\n", JobName.Value(),
				 args_string.Value() );
	}

		// Grab the full environment back out of the Env object 
	if(IsFulldebug(D_FULLDEBUG)) {
		MyString env_string;
		job_env.getDelimitedStringForDisplay(&env_string);
		dprintf(D_FULLDEBUG, "Env = %s\n", env_string.Value());
	}

	// Check to see if we need to start this process paused, and if
	// so, pass the right flag to DC::Create_Process().
	int job_opt_mask = DCJOBOPT_NO_CONDOR_ENV_INHERIT;
	if (!param_boolean("JOB_INHERITS_STARTER_ENVIRONMENT",false)) {
		job_opt_mask |= DCJOBOPT_NO_ENV_INHERIT;
	}
	int suspend_job_at_exec = 0;
	JobAd->LookupBool( ATTR_SUSPEND_JOB_AT_EXEC, suspend_job_at_exec);
	if( suspend_job_at_exec ) {
		dprintf( D_FULLDEBUG, "OsProc::StartJob(): "
				 "Job wants to be suspended at exec\n" );
		job_opt_mask |= DCJOBOPT_SUSPEND_ON_EXEC;
	}

	// If there is a requested coresize for this job, enforce it.
	// Convert negative and very large values to RLIM_INFINITY, meaning
	// no size limit.
	// RLIM_INFINITY is unsigned, but its value and type size vary.
	long long core_size_ad;
	size_t core_size;
	size_t *core_size_ptr = NULL;
#if !defined(WIN32)
	if ( JobAd->LookupInteger( ATTR_CORE_SIZE, core_size_ad ) ) {
		if ( core_size_ad < 0 || (unsigned long long)core_size_ad > RLIM_INFINITY ) {
			core_size = RLIM_INFINITY;
		} else {
			core_size = (size_t)core_size_ad;
		}
		core_size_ptr = &core_size;
	}
#endif // !defined(WIN32)

	long rlimit_as_hard_limit = 0;
	char *rlimit_expr = param("STARTER_RLIMIT_AS");
	if (rlimit_expr) {
		classad::ClassAdParser parser;

		classad::ExprTree *tree = parser.ParseExpression(rlimit_expr);
		if (tree) {
			classad::Value val;
			long long result;

			if (EvalExprTree(tree, Starter->jic->machClassAd(), JobAd, val) && 
				val.IsIntegerValue(result)) {
					result *= 1024 * 1024; // convert to megabytes
					rlimit_as_hard_limit = (long)result; // truncate for Create_Process
					if (result > rlimit_as_hard_limit) {
						// if truncation to long results in a change in the value, then
						// the requested limit must be > 2 GB and we are on a 32 bit platform
						// in that case, the requested limit is > than what the process can get anyway
						// so just don't set a limit.
						rlimit_as_hard_limit = 0;
					}
					if (rlimit_as_hard_limit > 0) {
						dprintf(D_ALWAYS, "Setting job's virtual memory rlimit to %ld megabytes\n", rlimit_as_hard_limit);
					}
			} else {
				dprintf(D_ALWAYS, "Can't evaluate STARTER_RLIMIT_AS expression %s\n", rlimit_expr);
			}
		} else {
			dprintf(D_ALWAYS, "Can't parse STARTER_RLIMIT_AS expression: %s\n", rlimit_expr);
		}
	}

	int *affinity_mask = makeCpuAffinityMask(Starter->getMySlotNumber());

#if defined ( WIN32 )
    owner_profile_.update ();
    /*************************************************************
    NOTE: We currently *ONLY* support loading slot-user profiles.
    This limitation will be addressed shortly, by allowing regular 
    users to load their registry hive - Ben [2008-09-31]
    **************************************************************/
    bool load_profile = false,
         run_as_owner = false;
    JobAd->LookupBool ( ATTR_JOB_LOAD_PROFILE, load_profile );
    JobAd->LookupBool ( ATTR_JOB_RUNAS_OWNER,  run_as_owner );
    if ( load_profile && !run_as_owner ) {
        if ( owner_profile_.load () ) {
            /* publish the users environment into that of the main 

            job's environment */
            if ( !owner_profile_.environment ( job_env ) ) {
                dprintf ( D_ALWAYS, "OsProc::StartJob(): Failed to "
                    "export owner's environment.\n" );
            }            
        } else {
            dprintf ( D_ALWAYS, "OsProc::StartJob(): Failed to load "
                "owner's profile.\n" );
        }
    }
#endif

		// While we are still in user priv, print out the username
#if defined(LINUX)
	if( Starter->glexecPrivSepHelper() ) {
			// TODO: if there is some way to figure out the final username,
			// print it out here or after starting the job.
		dprintf(D_ALWAYS,"Running job via glexec\n");
	}
#else
	if( false ) {
	}
#endif
	else {
		char const *username = NULL;
		char const *how = "";
		CondorPrivSepHelper* cpsh = Starter->condorPrivSepHelper();
		if( cpsh ) {
			username = cpsh->get_user_name();
			how = "via privsep switchboard ";
		}
		else {
			username = get_user_loginname();
		}
		if( !username ) {
			username = "******";
		}
		dprintf(D_ALWAYS,"Running job %sas user %s\n",how,username);
	}

	set_priv ( priv );

    // use this to return more detailed and reliable error message info
    // from create-process operation.
    MyString create_process_err_msg;

	if (privsep_helper != NULL) {
		const char* std_file_names[3] = {
			privsep_stdin_name.Value(),
			privsep_stdout_name.Value(),
			privsep_stderr_name.Value()
		};
		JobPid = privsep_helper->create_process(JobName.Value(),
		                                        args,
		                                        job_env,
		                                        job_iwd,
		                                        fds,
		                                        std_file_names,
		                                        nice_inc,
		                                        core_size_ptr,
		                                        1,
		                                        job_opt_mask,
		                                        family_info,
												affinity_mask,
												&create_process_err_msg);
	}
	else {
		JobPid = daemonCore->Create_Process( JobName.Value(),
		                                     args,
		                                     PRIV_USER_FINAL,
		                                     1,
		                                     FALSE,
		                                     FALSE,
		                                     &job_env,
		                                     job_iwd,
		                                     family_info,
		                                     NULL,
		                                     fds,
		                                     NULL,
		                                     nice_inc,
		                                     NULL,
		                                     job_opt_mask, 
		                                     core_size_ptr,
                                             affinity_mask,
											 NULL,
                                             &create_process_err_msg,
                                             fs_remap,
											 rlimit_as_hard_limit);
	}

	// Create_Process() saves the errno for us if it is an "interesting" error.
	int create_process_errno = errno;

    // errno is 0 in the privsep case.  This executes for the daemon core create-process logic
    if ((FALSE == JobPid) && (0 != create_process_errno)) {
        if (create_process_err_msg != "") create_process_err_msg += " ";
        MyString errbuf;
        errbuf.formatstr("(errno=%d: '%s')", create_process_errno, strerror(create_process_errno));
        create_process_err_msg += errbuf;
    }

	// now close the descriptors in fds array.  our child has inherited
	// them already, so we should close them so we do not leak descriptors.
	// NOTE, we want to use a special method to close the starter's
	// versions, if that's what we're using, so we don't think we've
	// still got those available in other parts of the code for any
	// reason.
	for ( int i = 0; i <= 2; i++ ) {
		if ( fds[i] >= 0 ) {
			daemonCore->Close_FD ( fds[i] );
		}
	}

	if ( JobPid == FALSE ) {
		JobPid = -1;

		if(!create_process_err_msg.IsEmpty()) {

			// if the reason Create_Process failed was that registering
			// a family with the ProcD failed, it is indicative of a
			// problem regarding this execute machine, not the job. in
			// this case, we'll want to EXCEPT instead of telling the
			// Shadow to put the job on hold. there are probably other
			// error conditions where EXCEPTing would be more appropriate
			// as well...
			//
			if (create_process_errno == DaemonCore::ERRNO_REGISTRATION_FAILED) {
				EXCEPT("Create_Process failed to register the job with the ProcD");
			}

			MyString err_msg = "Failed to execute '";
			err_msg += JobName;
			err_msg += "'";
			if(!args_string.IsEmpty()) {
				err_msg += " with arguments ";
				err_msg += args_string.Value();
			}
			err_msg += ": ";
			err_msg += create_process_err_msg;
			if( !ThisProcRunsAlongsideMainProc() ) {
				Starter->jic->notifyStarterError( err_msg.Value(),
			    	                              true,
			        	                          CONDOR_HOLD_CODE_FailedToCreateProcess,
			            	                      create_process_errno );
			}
		}

		dprintf(D_ALWAYS,"Create_Process(%s,%s, ...) failed: %s\n",
			JobName.Value(), args_string.Value(), create_process_err_msg.Value());
		return 0;
	}

	num_pids++;

	dprintf(D_ALWAYS,"Create_Process succeeded, pid=%d\n",JobPid);

	job_start_time.getTime();

	return 1;
}
void
doContactSchedd()
{
	if (command_queue.IsEmpty()) {
		daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min
		return;
	}

	dprintf(D_FULLDEBUG,"in doContactSchedd\n");

	SchedDRequest * current_command = NULL;

	int error=FALSE;
	std::string error_msg;
	CondorError errstack;
	bool do_reschedule = false;
	int failure_line_num = 0;
	int failure_errno = 0;

	// Try connecting to schedd
	DCSchedd dc_schedd ( ScheddAddr, ScheddPool );
	if (dc_schedd.error() || !dc_schedd.locate()) {
		sprintf( error_msg, "Error locating schedd %s", ScheddAddr );

		dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );

		// If you can't connect return "Failure" on every job request
		command_queue.Rewind();
		while (command_queue.Next(current_command)) {
			if (current_command->status != SchedDRequest::SDCS_NEW)
				continue;

			if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) {
				const char * result[] = {
					GAHP_RESULT_FAILURE,
					error_msg.c_str(),
					"0"};
				enqueue_result (current_command->request_id, result, 3);
			} else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) {
				const char * result[] = {
									GAHP_RESULT_FAILURE,
									NULL,
									error_msg.c_str() };
				enqueue_result (current_command->request_id, result, 3);
			} else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) {
				const char * result[] = {
									GAHP_RESULT_FAILURE,
									error_msg.c_str(),
									NULL };
				enqueue_result (current_command->request_id, result, 3);
			} else {
				const char * result[] = {
									GAHP_RESULT_FAILURE,
									error_msg.c_str() };
				enqueue_result (current_command->request_id, result, 2);
			}

			current_command->status = SchedDRequest::SDCS_COMPLETED;
		}
	}

	
	SchedDRequest::schedd_command_type commands [] = {
		SchedDRequest::SDC_REMOVE_JOB,
		SchedDRequest::SDC_HOLD_JOB,
		SchedDRequest::SDC_RELEASE_JOB };

	const char * command_titles [] = {
		"REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" };

	// REMOVE
	// HOLD
	// RELEASE
	int i=0;
	while (i<3) {
		
		
		StringList id_list;
		SimpleList <SchedDRequest*> this_batch;

		SchedDRequest::schedd_command_type this_command = commands[i];
		const char * this_action = command_titles[i];
		const char * this_reason = NULL;

		dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action);
		
		error = FALSE;

		// Create a batch of commands with the same command type AND the same reason		
		command_queue.Rewind();
		while (command_queue.Next(current_command)) {
			if (current_command->status != SchedDRequest::SDCS_NEW)
				continue;

			if (current_command->command != this_command)
				continue;

			if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0))
				continue;

			if (this_reason == NULL)
				this_reason = current_command->reason;
				
			char job_id_buff[30];
			sprintf (job_id_buff, "%d.%d",
				current_command->cluster_id,
				current_command->proc_id);
			id_list.append (job_id_buff);

			this_batch.Append (current_command);
		}

		// If we haven't found any....
		if (id_list.isEmpty()) {
			i++;
			continue;	// ... then try the next command
		}

		// Perform the appropriate command on the current batch
		ClassAd * result_ad= NULL;
		if (this_command == SchedDRequest::SDC_REMOVE_JOB)  {
			errstack.clear();
			result_ad=
				dc_schedd.removeJobs (
					&id_list,
					this_reason,
					&errstack);
		} else if (this_command == SchedDRequest::SDC_HOLD_JOB) {
			errstack.clear();
			result_ad=
				dc_schedd.holdJobs (
					&id_list,
					this_reason,
					NULL,
			 		&errstack);
		} else if (this_command == SchedDRequest::SDC_RELEASE_JOB)  {
			errstack.clear();
			result_ad=
				dc_schedd.releaseJobs (
					&id_list,
					this_reason,
					&errstack);
		} else {
			EXCEPT( "Unexpected command type %d in doContactSchedd",
					this_command );
		}

		// Analyze the result ad
		if (!result_ad) {
			error = TRUE;
			sprintf( error_msg, "Error connecting to schedd %s %s: %s",
					 ScheddAddr, dc_schedd.addr(), errstack.getFullText() );
		}
		else {
			result_ad->dPrint (D_FULLDEBUG);
			if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) {
				do_reschedule = true;
			}
		}

		// Go through the batch again, and create responses for each request
		this_batch.Rewind();
		while (this_batch.Next(current_command)) {
			
			// Check the result
			char job_id_buff[30];
			if (result_ad && (error == FALSE)) {
				sprintf (job_id_buff, "job_%d_%d",
					current_command->cluster_id,
					current_command->proc_id);
				
				int remove_result;
				if (result_ad->LookupInteger (job_id_buff, remove_result)) {
					switch (remove_result) {
						case AR_ERROR:
							error = TRUE;
							error_msg = "General Error";
							break;
						case AR_SUCCESS:
							error = FALSE;
							break;
						case AR_NOT_FOUND:
							error = TRUE;
							error_msg = "Job not found";
							break;
						case AR_BAD_STATUS:
							error = TRUE;
							error_msg = "Bad job status";
							break;
						case AR_ALREADY_DONE:
							error = TRUE;
							error_msg = "Already done";
							break;
						case AR_PERMISSION_DENIED:
							error = TRUE;
							error_msg = "Permission denied";
							break;
						default:
							error = TRUE;
							error_msg = "Unknown Result";
					} // hctiws

				} else {
					error_msg = "Unable to get result";
				} // fi lookup result for job
			} // fi error == FALSE

			if (error) {
				dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n",
						this_action,
						current_command->cluster_id,
						current_command->proc_id,
						error_msg.c_str());

				const char * result[2];
				result[0] = GAHP_RESULT_FAILURE;
				result[1] = error_msg.c_str();

				enqueue_result (current_command->request_id, result, 2);
			} else {
				dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n",
						this_action,
						current_command->cluster_id,
						current_command->proc_id);

				const char * result[2];
				result[0] = GAHP_RESULT_SUCCESS;
				result[1] = NULL;

				enqueue_result (current_command->request_id, result, 2);
			} // fi error

			// Mark the status
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		} // elihw this_batch

		if ( result_ad ) {
			delete result_ad;
		}
	}

	dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n");
	

	// JOB_STAGE_IN
	int MAX_BATCH_SIZE=1; // This should be a config param

	SimpleList <SchedDRequest*> stage_in_batch;
	do {
		stage_in_batch.Clear();

		command_queue.Rewind();
		while (command_queue.Next(current_command)) {

			if (current_command->status != SchedDRequest::SDCS_NEW)
				continue;

			if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN)
				continue;

			dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", 
					 current_command->cluster_id,
					 current_command->proc_id);

			stage_in_batch.Append (current_command);
			if (stage_in_batch.Number() >= MAX_BATCH_SIZE)
				break;
		}

		if (stage_in_batch.Number() > 0) {
			ClassAd ** array = new ClassAd*[stage_in_batch.Number()];
			i=0;
			stage_in_batch.Rewind();
			while (stage_in_batch.Next(current_command)) {
				array[i++] = current_command->classad;
			}

			error = FALSE;
			errstack.clear();
			if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(),
										  array,
										  &errstack )) {
				error = TRUE;
				sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() );
				dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
			}
			delete [] array;
  
			stage_in_batch.Rewind();
			while (stage_in_batch.Next(current_command)) {
				current_command->status = SchedDRequest::SDCS_COMPLETED;

				if (error) {
					const char * result[] = {
						GAHP_RESULT_FAILURE,
						error_msg.c_str() };
					enqueue_result (current_command->request_id, result, 2);

				} else {
					const char * result[] = {
						GAHP_RESULT_SUCCESS,
						NULL };
					enqueue_result (current_command->request_id, result, 2);
				}
			} // elihw (command_queue)
		} // fi has STAGE_IN requests
	} while (stage_in_batch.Number() > 0);

	dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n");
	

	// JOB_STAGE_OUT
	SimpleList <SchedDRequest*> stage_out_batch;

	command_queue.Rewind();
	while (command_queue.Next(current_command)) {

		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT)
			continue;


		stage_out_batch.Append (current_command);
	}

	if (stage_out_batch.Number() > 0) {
		std::string constraint = "";
		stage_out_batch.Rewind();
		int jobsexpected = stage_out_batch.Number();
		while (stage_out_batch.Next(current_command)) {
			sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||",
									current_command->cluster_id,
									current_command->proc_id );
		}
		constraint += "False";

		error = FALSE;
		errstack.clear();
		int jobssent;
		if (!dc_schedd.receiveJobSandbox( constraint.c_str(),
										  &errstack, &jobssent )) {
			error = TRUE;
			sprintf( error_msg, "Error receiving files from schedd %s: %s",
							   ScheddAddr, errstack.getFullText() );
			dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
		}

		if(error == FALSE && jobssent != jobsexpected) {
			error = TRUE;
			sprintf( error_msg, "Schedd %s didn't send expected files",
					 ScheddAddr );
			dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str());
		}
  
		stage_out_batch.Rewind();
		while (stage_out_batch.Next(current_command)) {
			current_command->status = SchedDRequest::SDCS_COMPLETED;

			if (error) {
				const char * result[] = {
								GAHP_RESULT_FAILURE,
								error_msg.c_str() };
				enqueue_result (current_command->request_id, result, 2);

			} else {
				const char * result[] = {
										GAHP_RESULT_SUCCESS,
										NULL };
				enqueue_result (current_command->request_id, result, 2);
			}
		} // elihw (command_queue)
	} // fi has STAGE_OUT requests


	dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n");

	CondorVersionInfo ver_info(dc_schedd.version());
	bool delegate_credential;
	if ( ver_info.built_since_version(6,7,19) &&
		 param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) {
		delegate_credential = true;
	} else {
		delegate_credential = false;
	}

	// JOB_REFRESH_PROXY
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {

		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY)
			continue;

		time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad);
		time_t result_expiration_time = 0;

		bool result;
		errstack.clear();
		if ( delegate_credential ) {
			result = dc_schedd.delegateGSIcredential( 
												current_command->cluster_id,
												current_command->proc_id,
												current_command->proxy_file,
												expiration_time,
												&result_expiration_time,
												&errstack );

				// Currently, we do not propagate the actual resulting
				// expiration time back to the gridmanager.  We
				// probably should.
		} else {
			result = dc_schedd.updateGSIcredential( 
												current_command->cluster_id,
												current_command->proc_id,
												current_command->proxy_file,
												&errstack );
		}

		current_command->status = SchedDRequest::SDCS_COMPLETED;

		if (result == false) {
			sprintf( error_msg, "Error refreshing proxy to schedd %s: %s",
					 ScheddAddr, errstack.getFullText() );
			dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );

			const char * result_to_queue[] = {
				GAHP_RESULT_FAILURE,
				error_msg.c_str() };
			enqueue_result (current_command->request_id, result_to_queue, 2);

		} else {
			const char * result_to_queue[] = {
				GAHP_RESULT_SUCCESS,
				NULL };
			enqueue_result (current_command->request_id, result_to_queue, 2);
		}

	}


	// Now do all the QMGMT transactions
	error = FALSE;

	// Try connecting to the queue
	Qmgr_connection * qmgr_connection;
	
	if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) {
		error = TRUE;
		sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr );
		dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
	} else {
		errno = 0;
		AbortTransaction(); // Just so we can call BeginTransaction() in the loop
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			failure_errno = errno;
			goto contact_schedd_disconnect;
		}
	}


	dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n");
	
	// UPDATE_CONSTRAINED
	// UDATE_JOB
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {
		
		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) &&
			(current_command->command != SchedDRequest::SDC_UPDATE_JOB))
			continue;

		if (qmgr_connection == NULL)
			goto update_report_result;
		
		error = FALSE;
		errno = 0;
		BeginTransaction();
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			failure_errno = errno;
			goto contact_schedd_disconnect;
		}

		current_command->classad->ResetExpr();
		ExprTree *tree;
		const char *lhstr, *rhstr;
		while( current_command->classad->NextExpr(lhstr, tree) ) {

			rhstr = ExprTreeToString( tree );
			if( !lhstr || !rhstr) {
				sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s",
												 current_command->constraint );
				dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
				error = TRUE;
			} else {
				if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) {
					if( SetAttributeByConstraint(current_command->constraint,
												lhstr,
												rhstr) == -1 ) {
						if ( errno == ETIMEDOUT ) {
							failure_line_num = __LINE__;
							failure_errno = errno;
							goto contact_schedd_disconnect;
						}
						sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s",
									errno, lhstr, rhstr, current_command->constraint );
						dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
						error = TRUE;
					}
				} else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) {
					if( SetAttribute(current_command->cluster_id,
											current_command->proc_id,
											lhstr,
											rhstr) == -1 ) {
						if ( errno == ETIMEDOUT ) {
							failure_line_num = __LINE__;
							failure_errno = errno;
							goto contact_schedd_disconnect;
						}
						sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d",
										 lhstr, rhstr, current_command->cluster_id,  current_command->proc_id);
						dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
						error = TRUE;
					}
				}
			}

			if (error)
				break;
		} // elihw classad

update_report_result:
		if (error) {
			const char * result[] = {
				GAHP_RESULT_FAILURE,
				error_msg.c_str() };


			//RemoteCommitTransaction();
			enqueue_result (current_command->request_id, result, 2);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
			if ( qmgr_connection != NULL ) {
				errno = 0;
				AbortTransaction();
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					failure_errno = errno;
					goto contact_schedd_disconnect;
				}
			}
		} else {
			if ( RemoteCommitTransaction() < 0 ) {
				failure_line_num = __LINE__;
				failure_errno = errno;
				goto contact_schedd_disconnect;
			}
			const char * result[] = {
				GAHP_RESULT_SUCCESS,
				NULL };
			enqueue_result (current_command->request_id, result, 2);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		} // fi

	} // elihw

	
	dprintf (D_FULLDEBUG, "Processing UPDATE_LEASE requests\n");

	// UPDATE_LEASE
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {
		
		error = FALSE;

		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE)
			continue;

		std::string success_job_ids="";
		if (qmgr_connection == NULL) {
			sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr );
			error = TRUE;
		} else {
			error = FALSE;
			errno = 0;
			BeginTransaction();
			if ( errno == ETIMEDOUT ) {
				failure_line_num = __LINE__;
				failure_errno = errno;
				goto contact_schedd_disconnect;
			}
		
			for (i=0; i<current_command->num_jobs; i++) {
			
				time_t time_now = time(NULL);
				int duration = 
					current_command->expirations[i].expiration - time_now;

				dprintf (D_FULLDEBUG, 
						 "Job %d.%d SetTimerAttribute=%d\n",
						 current_command->expirations[i].cluster,
						 current_command->expirations[i].proc,
						 duration);
		
				if (SetTimerAttribute (current_command->expirations[i].cluster,
									   current_command->expirations[i].proc,
									   ATTR_TIMER_REMOVE_CHECK,
									   duration) < 0) {

					if ( errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						failure_errno = errno;
						goto contact_schedd_disconnect;
					}
					dprintf (D_ALWAYS, 
							 "Unable to SetTimerAttribute(%d, %d), errno=%d\n",
							 current_command->expirations[i].cluster,
							 current_command->expirations[i].proc,
							 errno);
						 
				} else {
						// Append job id to the result line
					if (success_job_ids.length() > 0)
						success_job_ids += ",";

					sprintf_cat( success_job_ids,
						"%d.%d",
						current_command->expirations[i].cluster,
						current_command->expirations[i].proc);
				}
			} //rof jobs for request
		} // fi error


		if (error) {
			const char * result[] = {
				GAHP_RESULT_FAILURE,
				error_msg.c_str(),
				NULL
			};


			//RemoteCommitTransaction();
			enqueue_result (current_command->request_id, result, 3);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
			if ( qmgr_connection != NULL ) {
				errno = 0;
				AbortTransaction();
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					failure_errno = errno;
					goto contact_schedd_disconnect;
				}
			}
		} else {
			if ( RemoteCommitTransaction() < 0 ) {
				failure_line_num = __LINE__;
				failure_errno = errno;
				goto contact_schedd_disconnect;
			}
			const char * result[] = {
				GAHP_RESULT_SUCCESS,
				NULL,
				success_job_ids.length()?success_job_ids.c_str():NULL
			};
			enqueue_result (current_command->request_id, result, 3);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		} // fi

	} // elihw UPDATE_LEASE requests

	dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n");

	// SUBMIT_JOB
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {

		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB)
			continue;

		int ClusterId = -1;
		int ProcId = -1;

		if (qmgr_connection == NULL) {
			error = TRUE;
			goto submit_report_result;
		}

		errno = 0;
		BeginTransaction();
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			failure_errno = errno;
			goto contact_schedd_disconnect;
		}
		error = FALSE;

		if ((ClusterId = NewCluster()) >= 0) {
			ProcId = NewProc (ClusterId);
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			failure_errno = errno;
			goto contact_schedd_disconnect;
		}

		if ( ClusterId < 0 ) {
			error = TRUE;
			error_msg = "Unable to create a new job cluster";
			dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
		} else if ( ProcId < 0 ) {
			error = TRUE;
			error_msg = "Unable to create a new job proc";
			dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
		}
		if ( ClusterId == -2 || ProcId == -2 ) {
			error = TRUE;
			error_msg =
				"Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n";
			dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
		}


		// Adjust the argument/environment syntax based on the version
		// of the schedd we are talking to.

		if( error == FALSE) {
			CondorVersionInfo version_info(dc_schedd.version());
			ArgList arglist;
			MyString arg_error_msg;
			Env env_obj;
			MyString env_error_msg;

			if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) ||
		   !	arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg))
			{
				sprintf( error_msg,
						"ERROR: ClassAd problem in converting arguments to syntax "
						"for schedd (version=%s): %s\n",
						dc_schedd.version() ? dc_schedd.version() : "NULL",
						arg_error_msg.Value());
				dprintf( D_ALWAYS,"%s\n", error_msg.c_str() );
				error = TRUE;
			}	

			if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) ||
			   !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info))
			{
				sprintf( error_msg,
						"ERROR: Failed to convert environment to target syntax"
						" for schedd (version %s): %s\n",
						dc_schedd.version() ? dc_schedd.version() : "NULL",
						env_error_msg.Value());
				dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
				error = TRUE;
			}
		}

		if( error == FALSE ) {
				// See the comment in the function body of ExpandInputFileList
				// for an explanation of what is going on here.
			MyString transfer_input_error_msg;
			if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) {
				dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() );
				error = TRUE;
			}
		}

		if ( error == FALSE ) {
			current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId);
			current_command->classad->Assign(ATTR_PROC_ID, ProcId);

			// Special case for the job lease
			int expire_time;
			if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) {
				if ( SetTimerAttribute( ClusterId, ProcId,
										ATTR_TIMER_REMOVE_CHECK,
										expire_time - time(NULL) ) == -1 ) {
					if ( errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						failure_errno = errno;
						goto contact_schedd_disconnect;
					}
					sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d",
							 ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId );
					dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
					error = TRUE;
					goto submit_report_result;
				}
				current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK );
			}

			// Set all the classad attribute on the remote classad
			current_command->classad->ResetExpr();
			ExprTree *tree;
			const char *lhstr, *rhstr;
			while( current_command->classad->NextExpr(lhstr, tree) ) {

				rhstr = ExprTreeToString( tree );
				if( !lhstr || !rhstr) {
					sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s",
												 current_command->constraint );
					dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
					error = TRUE;
				} else if( SetAttribute (ClusterId, ProcId,
											lhstr,
											rhstr) == -1 ) {
					if ( errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						failure_errno = errno;
						goto contact_schedd_disconnect;
					}
					sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d",
									 lhstr, rhstr, ClusterId, ProcId );
					dprintf( D_ALWAYS, "%s\n", error_msg.c_str() );
					error = TRUE;
				}

				if (error) break;
			} // elihw classad
		} // fi error==FALSE

submit_report_result:
		char job_id_buff[30];
		sprintf (job_id_buff, "%d.%d", ClusterId, ProcId);

		if (error) {
			const char * result[] = {
								GAHP_RESULT_FAILURE,
								job_id_buff,
								error_msg.c_str() };
			enqueue_result (current_command->request_id, result, 3);
			if ( qmgr_connection != NULL ) {
				errno = 0;
				AbortTransaction();
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					failure_errno = errno;
					goto contact_schedd_disconnect;
				}
			}
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		} else {
			if ( RemoteCommitTransaction() < 0 ) {
				failure_line_num = __LINE__;
				failure_errno = errno;
				goto contact_schedd_disconnect;
			}
			const char * result[] = {
									GAHP_RESULT_SUCCESS,
									job_id_buff,
									NULL };
			enqueue_result (current_command->request_id, result, 3);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		}
	} // elihw


	dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n");
		
	// STATUS_CONSTRAINED
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {

		if (current_command->status != SchedDRequest::SDCS_NEW)
			continue;

		if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED)
			continue;

		if (qmgr_connection != NULL) {
			SimpleList <MyString *> matching_ads;

			error = FALSE;
			
			ClassAd *next_ad;
			ClassAdList adlist;
				// Only use GetAllJobsByConstraint if remote schedd is
				// 6.9.5 or newer.  Previous versions either did not
				// support this call, or they closed the Qmgmt connection
				// as a side-effect of this call.
			if( ver_info.built_since_version(6,9,5) ) {
				dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n",
						 current_command->constraint );
					// NOTE: this could be made more efficient if we knew
					// the list of attributes to query.  For lack of that,
					// we just get all attributes.
				GetAllJobsByConstraint( current_command->constraint, "", adlist);
			}
			else {
					// This is the old latency-prone method.
				dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n",
						 current_command->constraint );
				next_ad = GetNextJobByConstraint( current_command->constraint, 1 );
				while( next_ad != NULL ) {
					adlist.Insert( next_ad );
					next_ad = GetNextJobByConstraint( current_command->constraint, 0 );
				}
			}

				// NOTE: ClassAdList will deallocate the ClassAds in it

			adlist.Rewind();
			while( (next_ad=adlist.Next()) ) {
				MyString * da_buffer = new MyString();	// Use a ptr to avoid excessive copying
				if ( useXMLClassads ) {
					ClassAdXMLUnparser unparser;
					unparser.SetUseCompactSpacing(true);
					unparser.Unparse (next_ad, *da_buffer);
				} else {
					NewClassAdUnparser unparser;
					unparser.SetUseCompactSpacing(true);
					unparser.Unparse (next_ad, *da_buffer);
				}
				matching_ads.Append (da_buffer);
			}
			if ( errno == ETIMEDOUT ) {
				failure_line_num = __LINE__;
				failure_errno = errno;
				goto contact_schedd_disconnect;
			}

			// now output this list of classads into a result
			const char ** result  = new const char* [matching_ads.Length() + 3];

			std::string _ad_count;
			sprintf( _ad_count, "%d", matching_ads.Length() );

			int count=0;
			result[count++] = GAHP_RESULT_SUCCESS;
			result[count++] = NULL;
			result[count++] = _ad_count.c_str();

			MyString *next_string;
			matching_ads.Rewind();
			while (matching_ads.Next(next_string)) {
				result[count++] = next_string->Value();
			}

			enqueue_result (current_command->request_id, result, count);
			current_command->status = SchedDRequest::SDCS_COMPLETED;

			// Cleanup
			matching_ads.Rewind();
			while (matching_ads.Next(next_string)) {
				delete next_string;
			}
			//CommitTransaction();
			delete [] result;
		}
		else {
			const char * result[] = {
				GAHP_RESULT_FAILURE,
				error_msg.c_str(),
				"0" };
			//RemoteCommitTransaction();
			enqueue_result (current_command->request_id, result, 3);
			current_command->status = SchedDRequest::SDCS_COMPLETED;
		}
	}	//elihw

	
 contact_schedd_disconnect:
	if ( qmgr_connection != NULL ) {
		DisconnectQ (qmgr_connection, FALSE);
	}

	if ( failure_line_num ) {
			// We had an error talking to the schedd. Take all of our
			// incomplete commands and mark them as failed.
			// TODO Consider retrying these commands, rather than
			//   immediately marking them as failed.
		if ( failure_errno == ETIMEDOUT ) {
			dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in "
					 "doContactSchedd()\n", failure_line_num );
			sprintf( error_msg, "Timed out talking to schedd" );
		} else {
			dprintf( D_ALWAYS, "Error talking to schedd at line %d in "
					 "doContactSchedd(), errno=%d (%s)\n", failure_line_num,
					 failure_errno, strerror(failure_errno) );
			sprintf( error_msg, "Error talking to schedd" );
		}
		command_queue.Rewind();
		while (command_queue.Next(current_command)) {
			if ( current_command->status != SchedDRequest::SDCS_NEW ) {
				continue;
			}
			switch( current_command->command ) {
			case SchedDRequest::SDC_UPDATE_JOB:
			case SchedDRequest::SDC_UPDATE_CONSTRAINED:
			{
				const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() };
				enqueue_result (current_command->request_id, result, 2);
				current_command->status = SchedDRequest::SDCS_COMPLETED;
			}
				break;
			case SchedDRequest::SDC_UPDATE_LEASE:
			{
				const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL };
				enqueue_result (current_command->request_id, result, 3);
				current_command->status = SchedDRequest::SDCS_COMPLETED;
			}
				break;
			case SchedDRequest::SDC_SUBMIT_JOB:
			{
				const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() };
				enqueue_result (current_command->request_id, result, 3);
				current_command->status = SchedDRequest::SDCS_COMPLETED;
			}
				break;
			case SchedDRequest::SDC_STATUS_CONSTRAINED:
			{
				const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" };
				enqueue_result (current_command->request_id, result, 3);
				current_command->status = SchedDRequest::SDCS_COMPLETED;
			}
				break;
			default:
					// Do nothing
				;
			}
		}
	}

	if ( do_reschedule ) {
		dc_schedd.reschedule();
	}

		// Write all of our results to our parent.
	flush_results();

	dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n");

	// Clean up the list
	command_queue.Rewind();
	while (command_queue.Next(current_command)) {
		if (current_command->status == SchedDRequest::SDCS_COMPLETED) {
			command_queue.DeleteCurrent();
			delete current_command;
		}
	}

	// Come back soon..
	// QUESTION: Should this always be a fixed time period?
	daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval );
}
Example #7
0
void
do_process_request(const ClassAd *inputAd, ClassAd *resultAd, const int req_number, 
				   const char *iwd, const char *stdio_iwd)
{
		// Check for inputAd
	if ( !inputAd ) {
		handle_process_request_error("No input ad",req_number,resultAd);
		return;
	}

		// Map the CMD specified in the input via the config file.
	MyString UnmappedJobName,JobName;
	if (inputAd->LookupString(ATTR_JOB_CMD,UnmappedJobName) == 0 ) {
			// no CMD specified.
		handle_process_request_error("No CMD specified",req_number,resultAd);
		return;
	}
	char *auth_commands = param("SOAPSHELL_AUTHORIZED_COMMANDS");
	StringList auth_list(auth_commands,",");
	if ( auth_commands ) free(auth_commands);
		// Each command needs four tuples; anything else is a misconfiguration
	if ( auth_list.number() % 4 != 0 ) {
		handle_process_request_error("Service is misconfigured: SOAPSHELL_AUTHORIZED_COMMANDS malformed",req_number,resultAd);
		return;
	}

	if ( auth_list.contains_anycase(UnmappedJobName.Value()) == TRUE ) {
		JobName = auth_list.next();
	}
	if ( JobName.IsEmpty() ) {
			// the CMD not authorized
		handle_process_request_error("Requested CMD not authorized via SOAPSHELL_AUTHORIZED_COMMANDS",req_number,resultAd);
		return;
	}

		// handle command line arguments.
	ArgList args;
	args.SetArgV1SyntaxToCurrentPlatform();
	args.AppendArg(JobName.Value());	// set argv[0] to command
	char *soapshell_args = auth_list.next();
	if ( soapshell_args && strcmp(soapshell_args,"*") ) {
		if(!args.AppendArgsV1RawOrV2Quoted(soapshell_args,NULL)) {
			dprintf( D_ALWAYS, "ERROR: SOAPSHELL_ARGS config macro invalid\n" );
		}
	} else if(!args.AppendArgsFromClassAd(inputAd,NULL)) {
		handle_process_request_error("Failed to setup CMD arguments",req_number,resultAd);
		return;
	}
		
		// handle the environment.
	Env job_env;
	char *env_str = auth_list.next();
	if ( env_str && strcmp(env_str,"*") ) {
		if(!job_env.MergeFromV1RawOrV2Quoted(env_str,NULL) ) {
			dprintf(D_ALWAYS,"ERROR: SOAPSHELL_ENVIRONMENT config macro invalid\n");
		}
	} else if(!job_env.MergeFrom(inputAd,NULL)) {
		// bad environment string in job ad!
		handle_process_request_error("Request has faulty environment string",req_number,resultAd);
		return;
	}

		// Write input files into iwd (we will write stdin later)
	if ( !write_input_files(inputAd, iwd) ) {
		// failed to write input files
		handle_process_request_error("Failed to write input files",req_number,resultAd);
		return;
	}

		// handle stdin, stdout, and stderr redirection
	const char* jobstdin_ = dircat(stdio_iwd,"stdin");
	MyString jobstdin(jobstdin_);
	const char* jobstdout_ = dircat(stdio_iwd,"stdout");
	MyString jobstdout(jobstdout_);
	const char* jobstderr_ = dircat(stdio_iwd,"stderr");
	MyString jobstderr(jobstderr_);
	delete [] jobstdin_;
	delete [] jobstdout_;
	delete [] jobstderr_;
	int flags = O_WRONLY | O_CREAT | O_TRUNC | O_APPEND | O_LARGEFILE;
		// write stdin file is needed
	{
		char *input = NULL;
		unsigned char *output = NULL;
		int output_length = 0;
		int fd = -1;
		inputAd->LookupString(ATTR_JOB_INPUT,&input);
		if ( input ) {
			// Caller needs to free *output if non-NULL
			condor_base64_decode(input,&output,&output_length);
			if ( output ) {
				fd = safe_open_wrapper_follow( jobstdin.Value(), flags, 0666 );
				if ( fd > -1 ) {
					write(fd,output,output_length);
					close(fd);
				}
				free(output);
			}
			free(input);
			if ( fd < 0 ) {
				handle_process_request_error("Failed to write stdin",req_number,resultAd);
				return;
			}
		}
	}
	int fds[3]; 
		// initialize these to -2 to mean they're not specified.
		// -1 will be treated as an error.
	fds[0] = -2; fds[1] = -2; fds[2] = -2;	
	fds[0] = safe_open_wrapper_follow( jobstdin.Value(), O_RDONLY | O_LARGEFILE ); // stdin	
	fds[1] = safe_open_wrapper_follow( jobstdout.Value(), flags, 0666 );	// stdout
	fds[2] = safe_open_wrapper_follow( jobstderr.Value(), flags, 0666 );	// stderr
	/* Bail out if we couldn't open stdout/err files correctly */
	if( fds[1]==-1 || fds[2]==-1 ) {
		/* only close ones that had been opened correctly */
		for ( int i = 0; i <= 2; i++ ) {
			if ( fds[i] >= 0 ) {
				daemonCore->Close_FD ( fds[i] );
			}
		}
		handle_process_request_error("Failed to write stdout/err files",req_number,resultAd);
		return;
	}

		// Print what we are about to do to the log
	MyString args_string;
	args.GetArgsStringForDisplay(&args_string,1);
	dprintf( D_ALWAYS, "About to exec %s %s\n", JobName.Value(),
				 args_string.Value() );

		// Spawn a process, baby!!!
	int JobPid = daemonCore->Create_Process( JobName.Value(),	// executable
		                                     args,				// args
		                                     PRIV_UNKNOWN,		// priv_state - TODO
		                                     0,					// reaper id - TODO
		                                     FALSE,				// want_command_port
		                                     &job_env,			// job environment
		                                     iwd,				// job iwd
		                                     NULL,				// family_info - TODO
		                                     NULL,				// sock_inherit_list
		                                     fds				// stdio redirection
										);

		// NOTE: Create_Process() saves the errno for us if it is an
		// "interesting" error.
	char const *create_process_error = NULL;
	if(JobPid == FALSE && errno) create_process_error = strerror(errno);

		// now close the descriptors in fds array.  our child has inherited
		// them already, so we should close them so we do not leak descriptors.
	for ( int i = 0; i <= 2; i++ ) {
		if ( fds[i] >= 0 ) {
			daemonCore->Close_FD ( fds[i] );
		}
	}

	if ( JobPid == FALSE ) {
		JobPid = -1;
		MyString errormsg;
		errormsg.formatstr("Create_Process failed %s",create_process_error ? create_process_error : "");
		handle_process_request_error(errormsg.Value(),req_number,resultAd);
		return;
	}


	dprintf(D_ALWAYS,"Create_Process succeeded, pid=%d\n",JobPid);

		// TODO - For now, just deal w/ one at a time. :(
		// So for now just wait for the child to exit.
#ifdef WIN32
#error This service does not yet work on Windows
#else
	{
		int exit_status;
		pid_t pid;
		for (;;) {
			pid = wait(&exit_status);
			dprintf(D_FULLDEBUG,"WAIT returned %d, errno=%d\n",pid,errno);
			if (pid == JobPid ) break;
			if (pid == -1 && errno != EINTR) {
				EXCEPT("waitpid failed errno=%d",errno);
			}
		}
		if ( WIFEXITED(exit_status) ) {
			int status = WEXITSTATUS(exit_status);
			resultAd->Assign("EXIT_STATUS",status);
		}		
	}
#endif

		// Job has completed, exit status is in the ad.  Now put
		// the output files into the result ad.
	stash_output_file(resultAd, jobstdout.Value(), ATTR_JOB_OUTPUT);
	stash_output_file(resultAd, jobstderr.Value(), ATTR_JOB_ERROR);

}
Example #8
0
int JavaProc::StartJob()
{
	
	MyString java_cmd;
	char* jarfiles = NULL;
	ArgList args;
	MyString arg_buf;

	// Since we are adding to the argument list, we may need to deal
	// with platform-specific arg syntax in the user's args in order
	// to successfully merge them with the additional java VM args.
	args.SetArgV1SyntaxToCurrentPlatform();

	// Construct the list of jar files for the command line
	// If a jar file is transferred locally, use its local name
	// (in the execute directory)
	// otherwise use the original name

	StringList jarfiles_orig_list;
	StringList jarfiles_local_list;
	StringList* jarfiles_final_list = NULL;

	if( JobAd->LookupString(ATTR_JAR_FILES,&jarfiles) ) {
		jarfiles_orig_list.initializeFromString( jarfiles );
		free( jarfiles );
		jarfiles = NULL;

		char * jarfile_name;
		const char * base_name;
		struct stat stat_buff;
		if( Starter->jic->iwdIsChanged() ) {
				// If the job's IWD has been changed (because we're
				// running in the sandbox due to file transfer), we
				// need to use a local version of the path to the jar
				// files, not the full paths from the submit machine. 
			jarfiles_orig_list.rewind();
			while( (jarfile_name = jarfiles_orig_list.next()) ) {
					// Construct the local name
				base_name = condor_basename( jarfile_name );
				MyString local_name = execute_dir;
				local_name += DIR_DELIM_CHAR;
				local_name += base_name; 

				if( stat(local_name.Value(), &stat_buff) == 0 ) {
						// Jar file exists locally, use local name
					jarfiles_local_list.append( local_name.Value() );
				} else {
						// Use the original name
					jarfiles_local_list.append (jarfile_name);
				}
			} // while(jarfiles_orig_list)

				// jarfiles_local_list is our real copy...
			jarfiles_final_list = &jarfiles_local_list;

		} else {  // !iwdIsChanged()

				// just use jarfiles_orig_list as our real copy...
			jarfiles_final_list = &jarfiles_orig_list;
		}			
	}

	startfile.formatstr("%s%cjvm.start",execute_dir,DIR_DELIM_CHAR);
	endfile.formatstr("%s%cjvm.end",execute_dir,DIR_DELIM_CHAR);

	if( !java_config(java_cmd,&args,jarfiles_final_list) ) {
		dprintf(D_FAILURE|D_ALWAYS,"JavaProc: Java is not configured!\n");
		return 0;
	}

	JobAd->Assign(ATTR_JOB_CMD, java_cmd.Value());

	arg_buf.formatstr("-Dchirp.config=%s%cchirp.config",execute_dir,DIR_DELIM_CHAR);
	args.AppendArg(arg_buf.Value());

	char *jvm_args1 = NULL;
	char *jvm_args2 = NULL;
	MyString jvm_args_error;
	bool jvm_args_success = true;
	JobAd->LookupString(ATTR_JOB_JAVA_VM_ARGS1, &jvm_args1);
	JobAd->LookupString(ATTR_JOB_JAVA_VM_ARGS2, &jvm_args2);
	if(jvm_args2) {
		jvm_args_success = args.AppendArgsV2Raw(jvm_args2, &jvm_args_error);
	}
	else if(jvm_args1) {
		jvm_args_success = args.AppendArgsV1Raw(jvm_args1, &jvm_args_error);
	}
	free(jvm_args1);
	free(jvm_args2);
	if (!jvm_args_success) {
		dprintf(D_ALWAYS, "JavaProc: failed to parse JVM args: %s\n",
				jvm_args_error.Value());
		return 0;
	}

	args.AppendArg("CondorJavaWrapper");
	args.AppendArg(startfile.Value());
	args.AppendArg(endfile.Value());

	MyString args_error;
	if(!args.AppendArgsFromClassAd(JobAd,&args_error)) {
		dprintf(D_ALWAYS,"JavaProc: failed to read job arguments: %s\n",
				args_error.Value());
		return 0;
	}

	// We are just talking to ourselves, so it is fine to use argument
	// syntax compatible with this current version of Condor.
	CondorVersionInfo ver_info;
	if(!args.InsertArgsIntoClassAd(JobAd,&ver_info,&args_error)) {
		dprintf(D_ALWAYS,"JavaProc: failed to insert java job arguments: %s\n",
				args_error.Value());
		return 0;
	}

	dprintf(D_ALWAYS,"JavaProc: Cmd=%s\n",java_cmd.Value());
	MyString args_string;
	args.GetArgsStringForDisplay(&args_string);
	dprintf(D_ALWAYS,"JavaProc: Args=%s\n",args_string.Value());

	return VanillaProc::StartJob();
}
Example #9
0
// Build the RSL submit string for this job from its job ad.
//
// If ATTR_NORDUGRID_RSL is set and begins with '&', that value is returned
// verbatim as the complete RSL.  Otherwise the RSL is assembled from the
// job's executable, arguments, stdin/stdout/stderr and stage-in/stage-out
// file lists, and any ATTR_NORDUGRID_RSL value is appended at the end.
//
// Returns a std::string allocated with new; this function keeps no
// reference to it, so the caller is expected to delete it.  On failure
// (ATTR_JOB_IWD missing, FULL_HOSTNAME not configured, or the job
// arguments cannot be read/converted) errorString is set and NULL is
// returned.
std::string *NordugridJob::buildSubmitRSL()
{
	int transfer_exec = TRUE;
	std::string *rsl = new std::string;
	StringList *stage_list = NULL;
	StringList *stage_local_list = NULL;
	char *attr_value = NULL;
	std::string rsl_suffix;
	std::string iwd;
	std::string executable;

	// A user-supplied RSL starting with '&' is a complete description;
	// use it as-is.
	if ( jobAd->LookupString( ATTR_NORDUGRID_RSL, rsl_suffix ) &&
						   rsl_suffix[0] == '&' ) {
		*rsl = rsl_suffix;
		return rsl;
	}

	if ( jobAd->LookupString( ATTR_JOB_IWD, iwd ) != 1 ) {
		errorString = "ATTR_JOB_IWD not defined";
		delete rsl;
		return NULL;
	}

	//Start off the RSL
	attr_value = param( "FULL_HOSTNAME" );
	if ( attr_value == NULL ) {
		// Passing NULL to a %s conversion is undefined behavior, so
		// report the missing setting instead of formatting it.
		errorString = "FULL_HOSTNAME not defined";
		delete rsl;
		return NULL;
	}
	formatstr( *rsl, "&(savestate=yes)(action=request)(hostname=%s)", attr_value );
	free( attr_value );
	attr_value = NULL;

	//We're assuming all job clasads have a command attribute
	jobAd->LookupString( ATTR_JOB_CMD, executable );
	jobAd->LookupBool( ATTR_TRANSFER_EXECUTABLE, transfer_exec );

	// Each clause below is opened here and closed by the ")(" prefix of
	// the next clause; the final ')' is appended near the end.
	*rsl += "(executable=";
	// If we're transferring the executable, strip off the path for the
	// remote machine, since it refers to the submit machine.
	if ( transfer_exec ) {
		*rsl += condor_basename( executable.c_str() );
	} else {
		*rsl += executable;
	}

	{
		ArgList args;
		MyString arg_errors;
		MyString rsl_args;
		if(!args.AppendArgsFromClassAd(jobAd,&arg_errors)) {
			dprintf(D_ALWAYS,"(%d.%d) Failed to read job arguments: %s\n",
					procID.cluster, procID.proc, arg_errors.Value());
			formatstr(errorString,"Failed to read job arguments: %s\n",
					arg_errors.Value());
			delete rsl;
			return NULL;
		}
		if(args.Count() != 0) {
			if(args.InputWasV1()) {
					// In V1 syntax, the user's input _is_ RSL
				if(!args.GetArgsStringV1Raw(&rsl_args,&arg_errors)) {
					dprintf(D_ALWAYS,
							"(%d.%d) Failed to get job arguments: %s\n",
							procID.cluster,procID.proc,arg_errors.Value());
					formatstr(errorString,"Failed to get job arguments: %s\n",
							arg_errors.Value());
					delete rsl;
					return NULL;
				}
			}
			else {
					// In V2 syntax, we convert the ArgList to RSL
				for(int i=0;i<args.Count();i++) {
					if(i) {
						rsl_args += ' ';
					}
					rsl_args += rsl_stringify(args.GetArg(i));
				}
			}
			*rsl += ")(arguments=";
			*rsl += rsl_args;
		}
	}

	// If we're transferring the executable, tell Nordugrid to set the
	// execute bit on the transferred executable.
	if ( transfer_exec ) {
		*rsl += ")(executables=";
		*rsl += condor_basename( executable.c_str() );
	}

	if ( jobAd->LookupString( ATTR_JOB_INPUT, &attr_value ) == 1) {
		// only add to list if not NULL_FILE (i.e. /dev/null)
		if ( ! nullFile(attr_value) ) {
			*rsl += ")(stdin=";
			*rsl += condor_basename(attr_value);
		}
		free( attr_value );
		attr_value = NULL;
	}

	stage_list = buildStageInList();

	if ( stage_list->isEmpty() == false ) {
		char *file;
		stage_list->rewind();

		*rsl += ")(inputfiles=";

		// Each entry is (basename "source"); the source is only filled
		// in for URLs, otherwise it is left as an empty string.
		while ( (file = stage_list->next()) != NULL ) {
			*rsl += "(";
			*rsl += condor_basename(file);
			if ( IsUrl( file ) ) {
				formatstr_cat( *rsl, " \"%s\")", file );
			} else {
				*rsl += " \"\")";
			}
		}
	}

	delete stage_list;
	stage_list = NULL;

	if ( jobAd->LookupString( ATTR_JOB_OUTPUT, &attr_value ) == 1) {
		// only add to list if not NULL_FILE (i.e. /dev/null)
		if ( ! nullFile(attr_value) ) {
			*rsl += ")(stdout=" REMOTE_STDOUT_NAME;
		}
		free( attr_value );
		attr_value = NULL;
	}

	if ( jobAd->LookupString( ATTR_JOB_ERROR, &attr_value ) == 1) {
		// only add to list if not NULL_FILE (i.e. /dev/null)
		if ( ! nullFile(attr_value) ) {
			*rsl += ")(stderr=" REMOTE_STDERR_NAME;
		}
		free( attr_value );
		attr_value = NULL;
	}

	stage_list = buildStageOutList();
	stage_local_list = buildStageOutLocalList( stage_list );

	if ( stage_list->isEmpty() == false ) {
		char *file;
		char *local_file;
		stage_list->rewind();
		stage_local_list->rewind();

		*rsl += ")(outputfiles=";

		// The two lists are walked in step: the remote name comes from
		// stage_list, the destination from stage_local_list.
		while ( (file = stage_list->next()) != NULL ) {
			local_file = stage_local_list->next();
			*rsl += "(";
			*rsl += condor_basename(file);
			if ( IsUrl( local_file ) ) {
				formatstr_cat( *rsl, " \"%s\")", local_file );
			} else {
				*rsl += " \"\")";
			}
		}
	}

	delete stage_list;
	stage_list = NULL;
	delete stage_local_list;
	stage_local_list = NULL;

	// Close the last open clause.
	*rsl += ')';

	if ( !rsl_suffix.empty() ) {
		*rsl += rsl_suffix;
	}

	dprintf(D_FULLDEBUG,"*** RSL='%s'\n",rsl->c_str());
	return rsl;
}