bool
SharedPortEndpoint::ChownSocket(priv_state priv)
{
#ifndef HAVE_SHARED_PORT
    return false;
#elif WIN32
    return false;
#elif HAVE_SCM_RIGHTS_PASSFD
    if( !can_switch_ids() ) {
        return true;
    }

    switch( priv ) {
    case PRIV_ROOT:
    case PRIV_CONDOR:
    case PRIV_CONDOR_FINAL:
    case PRIV_UNKNOWN:
            // Nothing needs to be done in this case, because the named
            // socket was created with condor ownership (we assume).
        return true;
    case PRIV_FILE_OWNER:
    case _priv_state_threshold:
            // these don't really make sense, but include them so
            // the compiler can warn about priv states not covered
        return true;
    case PRIV_USER:
    case PRIV_USER_FINAL:
        {
            priv_state orig_priv = set_root_priv();

            int rc = fchown( m_listener_sock.get_file_desc(),
                             get_user_uid(), get_user_gid() );
            if( rc != 0 ) {
                dprintf(D_ALWAYS,
                        "SharedPortEndpoint: failed to chown %s to %d:%d: %s.\n",
                        m_full_name.Value(),
                        get_user_uid(),
                        get_user_gid(),
                        strerror(errno));
            }

            set_priv( orig_priv );

            return rc == 0;
        }
    }

    EXCEPT("Unexpected priv state in SharedPortEndpoint(%d)", (int)priv);
    return false;
#else
#error HAVE_SHARED_PORT is defined, but no method for passing fds is enabled.
#endif
}
int
pseudo_get_user_info(ClassAd *&ad)
{
    static ClassAd* user_ad = NULL;

    if( ! user_ad ) {
            // if we don't have the ClassAd yet, allocate it and fill
            // it in with the info we care about
        user_ad = new ClassAd;

#ifndef WIN32
        char buf[1024];

        sprintf( buf, "%s = %d", ATTR_UID, (int)get_user_uid() );
        user_ad->Insert( buf );

        sprintf( buf, "%s = %d", ATTR_GID, (int)get_user_gid() );
        user_ad->Insert( buf );
#endif
    }

    ad = user_ad;
    return 0;
}
int
StoreData (const char * file_name, const void * data, const int data_size)
{
    if (!data) {
        return FALSE;
    }

    priv_state priv = set_root_priv();
    dprintf (D_FULLDEBUG, "in StoreData(), euid=%d\n", geteuid());

    int fd = safe_open_wrapper_follow(file_name, O_WRONLY | O_CREAT | O_TRUNC, 0600);
    if (fd == -1) {
        dprintf (D_ALWAYS, "Unable to store in %s\n", file_name);
        set_priv(priv);
        return FALSE;
    }

        // Change to user owning the cred (assume init_user_ids() has been called)
    if (fchmod (fd, S_IRUSR | S_IWUSR)) {
        dprintf(D_ALWAYS, "Failed to fchmod %s to S_IRUSR | S_IWUSR: %s\n",
                file_name, strerror(errno));
    }
    if (fchown (fd, get_user_uid(), get_user_gid())) {
        dprintf(D_ALWAYS, "Failed to fchown %s to %d.%d: %s\n",
                file_name, get_user_uid(), get_user_gid(), strerror(errno));
    }

    int written = write (fd, data, data_size);
    if (written < data_size) {
        dprintf (D_ALWAYS, "Can't write to %s: (%d)\n", file_name, errno);
        set_priv(priv);
        close(fd);
        return FALSE;
    }

    close (fd);
    set_priv(priv);
    return TRUE;
}
int VanillaProc::StartJob()
{
    dprintf(D_FULLDEBUG, "in VanillaProc::StartJob()\n");

    // vanilla jobs, unlike standard jobs, are allowed to run
    // shell scripts (or as is the case on NT, batch files).  so
    // edit the ad so we start up a shell, pass the executable as
    // an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

    CHAR        interpreter[MAX_PATH+1],
                systemshell[MAX_PATH+1];
    const char* jobtmp             = Starter->jic->origJobName();
    int         joblen             = strlen(jobtmp);
    const char  *extension         = joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
    bool        binary_executable  = ( extension &&
                                       ( MATCH == strcasecmp ( ".exe", extension ) ||
                                         MATCH == strcasecmp ( ".com", extension ) ) ),
                java_universe      = ( CONDOR_UNIVERSE_JAVA == job_universe );
    ArgList     arguments;
    MyString    filename,
                jobname,
                error;

    if ( extension && !java_universe && !binary_executable ) {

        /** since we do not actually know how long the extension of
            the file is, we'll need to hunt down the '.' in the path,
            if it exists */
        extension = strrchr ( jobtmp, '.' );

        if ( !extension ) {

            dprintf (
                D_ALWAYS,
                "VanillaProc::StartJob(): Failed to extract "
                "the file's extension.\n" );

            /** don't fail here, since we want executables to run
                as usual.  That is, some condor jobs submit
                executables that do not have the '.exe' extension,
                but are, nonetheless, executable binaries.  For
                instance, a submit script may contain:
                executable = executable$(OPSYS) */

        } else {

            /** pull out the path to the executable */
            if ( !JobAd->LookupString ( ATTR_JOB_CMD, jobname ) ) {
                /** fall back on Starter->jic->origJobName() */
                jobname = jobtmp;
            }

            /** If we transferred the job, it may have been
                renamed to condor_exec.exe even though it is
                not an executable. Here we rename it back to
                the correct extension before it runs. */
            if ( MATCH == strcasecmp (
                    CONDOR_EXEC,
                    condor_basename ( jobname.Value () ) ) ) {
                filename.formatstr ( "condor_exec%s", extension );
                if (rename(CONDOR_EXEC, filename.Value()) != 0) {
                    dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
                             "failed to rename executable from %s to %s\n",
                             CONDOR_EXEC, filename.Value() );
                }
            } else {
                filename = jobname;
            }

            /** Since we've renamed our executable, we need to
                update the job ad to reflect this change. */
            if ( !JobAd->Assign ( ATTR_JOB_CMD, filename ) ) {
                dprintf ( D_ALWAYS,
                          "VanillaProc::StartJob(): ERROR: failed to "
                          "set new executable name.\n" );
                return FALSE;
            }

            /** We've moved the script to argv[1], so we need to
                add the remaining arguments to positions argv[2]..
                argv[n]. */
            if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
                 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, &error ) ) {
                dprintf ( D_ALWAYS,
                          "VanillaProc::StartJob(): ERROR: failed to "
                          "get arguments from job ad: %s\n",
                          error.Value () );
                return FALSE;
            }

            /** Since we know already we don't want this file returned
                to us, we explicitly add it to an exception list which
                will stop the file transfer mechanism from considering
                it for transfer back to its submitter */
            Starter->jic->removeFromOutputFiles ( filename.Value () );
        }
    }
#endif

    // set up a FamilyInfo structure to tell OsProc to register a family
    // with the ProcD in its call to DaemonCore::Create_Process
    //
    FamilyInfo fi;

    // take snapshots at no more than 15 seconds in between, by default
    //
    fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

    m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
    if( ThisProcRunsAlongsideMainProc() ) {
            // If we track a secondary proc's family tree (such as
            // sshd) using the same dedicated account as the job's
            // family tree, we could end up killing the job when we
            // clean up the secondary family.
        m_dedicated_account = NULL;
    }
    if (m_dedicated_account) {
            // using login-based family tracking
        fi.login = m_dedicated_account;
            // The following message is documented in the manual as the
            // way to tell whether the dedicated execution account
            // configuration is being used.
        dprintf(D_ALWAYS,
                "Tracking process family by login \"%s\"\n",
                fi.login);
    }

    FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
    // on Linux, we also have the ability to track processes via
    // a phony supplementary group ID
    //
    gid_t tracking_gid = 0;
    if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
        if (!can_switch_ids() && (Starter->condorPrivSepHelper() == NULL)) {
            EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
                   "the group list of our children unless running as "
                   "root or using PrivSep");
        }
        fi.group_ptr = &tracking_gid;
    }

    // Increase the OOM score of this process; the child will inherit it.
    // This way, the job will be heavily preferred to be killed over a normal process.
    // OOM score is currently exponential - a score of 4 is a factor-16 increase in
    // the OOM score.
    setupOOMScore(4, 800);
#endif

#if defined(HAVE_EXT_LIBCGROUP)
    // Determine the cgroup
    std::string cgroup_base;
    param(cgroup_base, "BASE_CGROUP", "");
    MyString cgroup_str;
    const char *cgroup = NULL;
        /* Note on CONDOR_UNIVERSE_LOCAL - The cgroup setup code below
         * requires a unique name for the cgroup. It relies on
         * uniqueness of the MachineAd's Name
         * attribute. Unfortunately, in the local universe the
         * MachineAd (mach_ad elsewhere) is never populated, because
         * there is no machine. As a result the ASSERT on
         * starter_name fails. This means that the local universe
         * will not work on any machine that has BASE_CGROUP
         * configured. A potential workaround is to set
         * STARTER.BASE_CGROUP on any machine that is also running a
         * schedd, but that disables cgroup support from a
         * co-resident startd. Instead, I'm disabling cgroup support
         * from within the local universe until the interaction of
         * local universe and cgroups can be properly worked
         * out.  -matt 7 nov '12
         */
    if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup_base.length()) {
        MyString cgroup_uniq;
        std::string starter_name, execute_str;
        param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
            // Note: Starter is a global variable from os_proc.cpp
        Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
        if (starter_name.size() == 0) {
            char buf[16];
            sprintf(buf, "%d", getpid());
            starter_name = buf;
        }
        //ASSERT (starter_name.size());
        cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
        const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
        cgroup_uniq.replaceString(dir_delim, "_");
        cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
                             cgroup_uniq.Value());
        cgroup_str += this->CgroupSuffix();

        cgroup = cgroup_str.Value();
        ASSERT (cgroup != NULL);
        fi.cgroup = cgroup;
        dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
    }
#endif

// The chroot stuff really only works on linux
#ifdef LINUX
    {
        // Have Condor manage a chroot
        std::string requested_chroot_name;
        JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
        const char * allowed_root_dirs = param("NAMED_CHROOT");
        if (requested_chroot_name.size()) {
            dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
            StringList chroot_list(allowed_root_dirs);
            chroot_list.rewind();
            const char * next_chroot;
            bool acceptable_chroot = false;
            std::string requested_chroot;
            while ( (next_chroot=chroot_list.next()) ) {
                MyString chroot_spec(next_chroot);
                chroot_spec.Tokenize();
                const char * chroot_name = chroot_spec.GetNextToken("=", false);
                if (chroot_name == NULL) {
                    dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                }
                const char * next_dir = chroot_spec.GetNextToken("=", false);
                if (next_dir == NULL) {
                    dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                }
                dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                if (IsDirectory(next_dir) && chroot_name &&
                        (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                    acceptable_chroot = true;
                    requested_chroot = next_dir;
                }
            }
            // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
            // Is this the responsibility of Condor to check, or the sysadmin who set it up?
            if (!acceptable_chroot) {
                return FALSE;
            }
            dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

            std::stringstream ss;
            std::stringstream ss2;
            ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
            std::string execute_dir = ss2.str();
            ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
            std::string full_dir_str = ss.str();
            if (is_trivial_rootdir(requested_chroot)) {
                dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
            } else if (IsDirectory(execute_dir.c_str())) {
                {
                    TemporaryPrivSentry sentry(PRIV_ROOT);
                    if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                        dprintf( D_FAILURE|D_ALWAYS,
                                 "Failed to create sandbox directory in chroot (%s): %s\n",
                                 full_dir_str.c_str(),
                                 strerror(errno) );
                        return FALSE;
                    }
                    if (chown(full_dir_str.c_str(),
                              get_user_uid(),
                              get_user_gid()) == -1) {
                        EXCEPT("chown error on %s: %s",
                               full_dir_str.c_str(),
                               strerror(errno));
                    }
                }
                if (!fs_remap) {
                    fs_remap = new FilesystemRemap();
                }
                dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                    // FilesystemRemap object prints out an error message for us.
                    return FALSE;
                }
                dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                std::string root_str("/");
                if (fs_remap->AddMapping(requested_chroot, root_str)) {
                    return FALSE;
                }
            } else {
                dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
            }
        } else {
            dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
        }
    }
// End of chroot
#endif

    // On Linux kernel 2.4.19 and later, we can give each job its
    // own FS mounts.
    auto_free_ptr mount_under_scratch(param("MOUNT_UNDER_SCRATCH"));
    if (mount_under_scratch) {
        // try evaluating mount_under_scratch as a classad expression, if it is
        // an expression it must return a string. if it's not an expression, just
        // use it as a string (as we did before 8.3.6)
        classad::Value value;
        if (JobAd->EvaluateExpr(mount_under_scratch.ptr(), value)) {
            const char * pval = NULL;
            if (value.IsStringValue(pval)) {
                mount_under_scratch.set(strdup(pval));
            } else {
                // was an expression, but not a string, so report an error and fail.
                dprintf(D_ALWAYS | D_ERROR,
                        "ERROR: MOUNT_UNDER_SCRATCH does not evaluate to a string, it is : %s\n",
                        ClassAdValueToString(value));
                return FALSE;
            }
        }
    }

    // if execute dir is encrypted, add /tmp and /var/tmp to mount_under_scratch
    bool encrypt_execdir = false;
    JobAd->LookupBool(ATTR_ENCRYPT_EXECUTE_DIRECTORY, encrypt_execdir);
    if (encrypt_execdir || param_boolean_crufty("ENCRYPT_EXECUTE_DIRECTORY", false)) {
        // prepend /tmp, /var/tmp to whatever admin wanted. don't worry
        // if admin already listed /tmp etc - subdirs can appear twice
        // in this list because AddMapping() is ok w/ duplicate entries
        MyString buf("/tmp,/var/tmp,");
        buf += mount_under_scratch.ptr();
        mount_under_scratch.set(buf.StrDup());
    }

    if (mount_under_scratch) {
        std::string working_dir = Starter->GetWorkingDir();

        if (IsDirectory(working_dir.c_str())) {
            StringList mount_list(mount_under_scratch);

            mount_list.rewind();
            if (!fs_remap) {
                fs_remap = new FilesystemRemap();
            }
            char * next_dir;
            while ( (next_dir=mount_list.next()) ) {
                if (!*next_dir) {
                    // empty string?
                    mount_list.deleteCurrent();
                    continue;
                }
                std::string next_dir_str(next_dir);
                // Gah, I wish I could throw an exception to clean up these nested if statements.
                if (IsDirectory(next_dir)) {
                    char * full_dir = dirscat(working_dir, next_dir_str);
                    if (full_dir) {
                        std::string full_dir_str(full_dir);
                        delete [] full_dir; full_dir = NULL;
                        if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
                            dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
                            delete fs_remap;
                            return FALSE;
                        }
                        dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
                        if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
                            // FilesystemRemap object prints out an error message for us.
                            delete fs_remap;
                            return FALSE;
                        }
                    } else {
                        dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
                        delete fs_remap;
                        return FALSE;
                    }
                } else {
                    dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
                }
            }
        } else {
            dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
            delete fs_remap;
            return FALSE;
        }
        mount_under_scratch.clear();
    }

#if defined(LINUX)
    // On Linux kernel 2.6.24 and later, we can give each
    // job its own PID namespace
    if (param_boolean("USE_PID_NAMESPACES", false)) {
        if (!can_switch_ids()) {
            EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this "
                   "call in Linux unless running as root.");
        }
        fi.want_pid_namespace = this->SupportsPIDNamespace();
        if (fi.want_pid_namespace) {
            if (!fs_remap) {
                fs_remap = new FilesystemRemap();
            }
            fs_remap->RemapProc();
        }

        // When PID Namespaces are enabled, need to run the job
        // under the condor_pid_ns_init program, so that signals
        // propagate through to the child.

        // First tell the program where to log output status
        // via an environment variable
        if (param_boolean("USE_PID_NAMESPACE_INIT", true)) {
            Env env;
            MyString env_errors;
            MyString arg_errors;
            std::string filename;

            filename = Starter->GetWorkingDir();
            filename += "/.condor_pid_ns_status";

            env.MergeFrom(JobAd, &env_errors);
            env.SetEnv("_CONDOR_PID_NS_INIT_STATUS_FILENAME", filename);
            env.InsertEnvIntoClassAd(JobAd, &env_errors);

            Starter->jic->removeFromOutputFiles(condor_basename(filename.c_str()));
            this->m_pid_ns_status_filename = filename;

            // Now, set the job's CMD to the wrapper, and shift
            // over the arguments by one
            ArgList args;
            std::string cmd;

            JobAd->LookupString(ATTR_JOB_CMD, cmd);
            args.AppendArg(cmd);
            args.AppendArgsFromClassAd(JobAd, &arg_errors);
            args.InsertArgsIntoClassAd(JobAd, NULL, &arg_errors);

            std::string libexec;
            if( !param(libexec, "LIBEXEC") ) {
                dprintf(D_ALWAYS, "Cannot find LIBEXEC so can not run condor_pid_ns_init\n");
                return 0;
            }
            std::string c_p_n_i = libexec + "/condor_pid_ns_init";
            JobAd->Assign(ATTR_JOB_CMD, c_p_n_i);
        }
    }
    dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false");
#endif

    // have OsProc start the job
    //
    int retval = OsProc::StartJob(&fi, fs_remap);

    if (fs_remap != NULL) {
        delete fs_remap;
    }

#if defined(HAVE_EXT_LIBCGROUP)

    // Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
    // See Note near setup of param(BASE_CGROUP)
    if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup && retval) {
        std::string mem_limit;
        param(mem_limit, "CGROUP_MEMORY_LIMIT_POLICY", "soft");
        bool mem_is_soft = mem_limit == "soft";
        std::string cgroup_string = cgroup;
        CgroupLimits climits(cgroup_string);
        if (mem_is_soft || (mem_limit == "hard")) {
            ClassAd * MachineAd = Starter->jic->machClassAd();
            int MemMb;
            if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
                uint64_t MemMb_big = MemMb;
                m_memory_limit = MemMb_big;
                climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);

                // Note that ATTR_VIRTUAL_MEMORY on Linux
                // is sum of memory and swap, in Kilobytes
                int VMemKb;
                if (MachineAd->LookupInteger(ATTR_VIRTUAL_MEMORY, VMemKb)) {
                    uint64_t memsw_limit = ((uint64_t)1024) * VMemKb;
                    if (VMemKb > 0) {
                        // we're not allowed to set memsw limit <
                        // the hard memory limit.  If we haven't set the hard
                        // memory limit, the default may be infinity.
                        // So, if we've set soft, set hard limit to memsw - one page
                        if (mem_is_soft) {
                            uint64_t hard_limit = memsw_limit - 4096;
                            climits.set_memory_limit_bytes(hard_limit, false);
                        }
                        climits.set_memsw_limit_bytes(memsw_limit);
                    }
                } else {
                    dprintf(D_ALWAYS, "Not setting virtual memory limit in cgroup because "
                            "Virtual Memory attribute missing in machine ad.\n");
                }
            } else {
                dprintf(D_ALWAYS, "Not setting memory limit in cgroup because "
                        "Memory attribute missing in machine ad.\n");
            }
        } else if (mem_limit == "none") {
            dprintf(D_FULLDEBUG, "Not enforcing memory limit.\n");
        } else {
            dprintf(D_ALWAYS, "Invalid value of CGROUP_MEMORY_LIMIT_POLICY: %s.  Ignoring.\n", mem_limit.c_str());
        }

        // Now, set the CPU shares
        ClassAd * MachineAd = Starter->jic->machClassAd();
        int numCores = 1;
        if (MachineAd->LookupInteger(ATTR_CPUS, numCores)) {
            climits.set_cpu_shares(numCores*100);
        } else {
            dprintf(D_FULLDEBUG, "Invalid value of Cpus in machine ClassAd; ignoring.\n");
        }
        setupOOMEvent(cgroup);
    }

    m_statistics.Reconfig();

    // Now that the job is started, decrease the likelihood that the starter
    // is killed instead of the job itself.
    if (retval) {
        setupOOMScore(0, 0);
    }

#endif

    return retval;
}
int VanillaProc::StartJob()
{
    dprintf(D_FULLDEBUG, "in VanillaProc::StartJob()\n");

    // vanilla jobs, unlike standard jobs, are allowed to run
    // shell scripts (or as is the case on NT, batch files).  so
    // edit the ad so we start up a shell, pass the executable as
    // an argument to the shell, if we are asked to run a .bat file.
#ifdef WIN32

    CHAR        interpreter[MAX_PATH+1],
                systemshell[MAX_PATH+1];
    const char* jobtmp             = Starter->jic->origJobName();
    int         joblen             = strlen(jobtmp);
    const char  *extension         = joblen > 0 ? &(jobtmp[joblen-4]) : NULL;
    bool        binary_executable  = ( extension &&
                                       ( MATCH == strcasecmp ( ".exe", extension ) ||
                                         MATCH == strcasecmp ( ".com", extension ) ) ),
                java_universe      = ( CONDOR_UNIVERSE_JAVA == job_universe );
    ArgList     arguments;
    MyString    filename,
                jobname,
                error;

    if ( extension && !java_universe && !binary_executable ) {

        /** since we do not actually know how long the extension of
            the file is, we'll need to hunt down the '.' in the path,
            if it exists */
        extension = strrchr ( jobtmp, '.' );

        if ( !extension ) {

            dprintf (
                D_ALWAYS,
                "VanillaProc::StartJob(): Failed to extract "
                "the file's extension.\n" );

            /** don't fail here, since we want executables to run
                as usual.  That is, some condor jobs submit
                executables that do not have the '.exe' extension,
                but are, nonetheless, executable binaries.  For
                instance, a submit script may contain:
                executable = executable$(OPSYS) */

        } else {

            /** pull out the path to the executable */
            if ( !JobAd->LookupString ( ATTR_JOB_CMD, jobname ) ) {
                /** fall back on Starter->jic->origJobName() */
                jobname = jobtmp;
            }

            /** If we transferred the job, it may have been
                renamed to condor_exec.exe even though it is
                not an executable. Here we rename it back to
                the correct extension before it runs. */
            if ( MATCH == strcasecmp (
                    CONDOR_EXEC,
                    condor_basename ( jobname.Value () ) ) ) {
                filename.formatstr ( "condor_exec%s", extension );
                if (rename(CONDOR_EXEC, filename.Value()) != 0) {
                    dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: "
                             "failed to rename executable from %s to %s\n",
                             CONDOR_EXEC, filename.Value() );
                }
            } else {
                filename = jobname;
            }

            /** Since we've renamed our executable, we need to
                update the job ad to reflect this change. */
            if ( !JobAd->Assign ( ATTR_JOB_CMD, filename ) ) {
                dprintf ( D_ALWAYS,
                          "VanillaProc::StartJob(): ERROR: failed to "
                          "set new executable name.\n" );
                return FALSE;
            }

            /** We've moved the script to argv[1], so we need to
                add the remaining arguments to positions argv[2]..
                argv[n]. */
            if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) ||
                 !arguments.InsertArgsIntoClassAd ( JobAd, NULL, &error ) ) {
                dprintf ( D_ALWAYS,
                          "VanillaProc::StartJob(): ERROR: failed to "
                          "get arguments from job ad: %s\n",
                          error.Value () );
                return FALSE;
            }

            /** Since we know already we don't want this file returned
                to us, we explicitly add it to an exception list which
                will stop the file transfer mechanism from considering
                it for transfer back to its submitter */
            Starter->jic->removeFromOutputFiles ( filename.Value () );
        }
    }
#endif

    // set up a FamilyInfo structure to tell OsProc to register a family
    // with the ProcD in its call to DaemonCore::Create_Process
    //
    FamilyInfo fi;

    // take snapshots at no more than 15 seconds in between, by default
    //
    fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15);

    m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated();
    if( ThisProcRunsAlongsideMainProc() ) {
            // If we track a secondary proc's family tree (such as
            // sshd) using the same dedicated account as the job's
            // family tree, we could end up killing the job when we
            // clean up the secondary family.
        m_dedicated_account = NULL;
    }
    if (m_dedicated_account) {
            // using login-based family tracking
        fi.login = m_dedicated_account;
            // The following message is documented in the manual as the
            // way to tell whether the dedicated execution account
            // configuration is being used.
        dprintf(D_ALWAYS,
                "Tracking process family by login \"%s\"\n",
                fi.login);
    }

    FilesystemRemap * fs_remap = NULL;
#if defined(LINUX)
    // on Linux, we also have the ability to track processes via
    // a phony supplementary group ID
    //
    gid_t tracking_gid = 0;
    if (param_boolean("USE_GID_PROCESS_TRACKING", false)) {
        if (!can_switch_ids() && (Starter->condorPrivSepHelper() == NULL)) {
            EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify "
                   "the group list of our children unless running as "
                   "root or using PrivSep");
        }
        fi.group_ptr = &tracking_gid;
    }
#endif

#if defined(HAVE_EXT_LIBCGROUP)
    // Determine the cgroup
    std::string cgroup_base;
    param(cgroup_base, "BASE_CGROUP", "");
    MyString cgroup_str;
    const char *cgroup = NULL;
    if (cgroup_base.length()) {
        MyString cgroup_uniq;
        std::string starter_name, execute_str;
        param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN");
            // Note: Starter is a global variable from os_proc.cpp
        Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name);
        ASSERT (starter_name.size());
        cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str());
        const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'};
        cgroup_uniq.replaceString(dir_delim, "_");
        cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR,
                             cgroup_uniq.Value());
        cgroup = cgroup_str.Value();
        ASSERT (cgroup != NULL);
        fi.cgroup = cgroup;
        dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup);
    }
#endif

// The chroot stuff really only works on linux
#ifdef LINUX
    {
        // Have Condor manage a chroot
        std::string requested_chroot_name;
        JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name);
        const char * allowed_root_dirs = param("NAMED_CHROOT");
        if (requested_chroot_name.size()) {
            dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str());
            StringList chroot_list(allowed_root_dirs);
            chroot_list.rewind();
            const char * next_chroot;
            bool acceptable_chroot = false;
            std::string requested_chroot;
            while ( (next_chroot=chroot_list.next()) ) {
                MyString chroot_spec(next_chroot);
                chroot_spec.Tokenize();
                const char * chroot_name = chroot_spec.GetNextToken("=", false);
                if (chroot_name == NULL) {
                    dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                }
                const char * next_dir = chroot_spec.GetNextToken("=", false);
                if (next_dir == NULL) {
                    dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value());
                }
                dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value());
                if (IsDirectory(next_dir) && chroot_name &&
                        (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) {
                    acceptable_chroot = true;
                    requested_chroot = next_dir;
                }
            }
            // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit.
            // Is this the responsibility of Condor to check, or the sysadmin who set it up?
            if (!acceptable_chroot) {
                return FALSE;
            }
            dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str());

            std::stringstream ss;
            std::stringstream ss2;
            ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid();
            std::string execute_dir = ss2.str();
            ss << requested_chroot << DIR_DELIM_CHAR << ss2.str();
            std::string full_dir_str = ss.str();
            if (is_trivial_rootdir(requested_chroot)) {
                dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str());
            } else if (IsDirectory(execute_dir.c_str())) {
                {
                    TemporaryPrivSentry sentry(PRIV_ROOT);
                    if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) {
                        dprintf( D_FAILURE|D_ALWAYS,
                                 "Failed to create sandbox directory in chroot (%s): %s\n",
                                 full_dir_str.c_str(),
                                 strerror(errno) );
                        return FALSE;
                    }
                    if (chown(full_dir_str.c_str(),
                              get_user_uid(),
                              get_user_gid()) == -1) {
                        EXCEPT("chown error on %s: %s",
                               full_dir_str.c_str(),
                               strerror(errno));
                    }
                }
                if (!fs_remap) {
                    fs_remap = new FilesystemRemap();
                }
                dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str());
                if (fs_remap->AddMapping(execute_dir, full_dir_str)) {
                    // FilesystemRemap object prints out an error message for us.
                    return FALSE;
                }
                dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/");
                std::string root_str("/");
                if (fs_remap->AddMapping(requested_chroot, root_str)) {
                    return FALSE;
                }
            } else {
                dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str());
            }
        } else {
            dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n");
        }
    }
// End of chroot
#endif

    // On Linux kernel 2.4.19 and later, we can give each job its
    // own FS mounts.
    char * mount_under_scratch = param("MOUNT_UNDER_SCRATCH");
    if (mount_under_scratch) {
        std::string working_dir = Starter->GetWorkingDir();

        if (IsDirectory(working_dir.c_str())) {
            StringList mount_list(mount_under_scratch);
            free(mount_under_scratch);

            mount_list.rewind();
            if (!fs_remap) {
                fs_remap = new FilesystemRemap();
            }
            char * next_dir;
            while ( (next_dir=mount_list.next()) ) {
                if (!*next_dir) {
                    // empty string?
                    mount_list.deleteCurrent();
                    continue;
                }
                std::string next_dir_str(next_dir);
                // Gah, I wish I could throw an exception to clean up these nested if statements.
                if (IsDirectory(next_dir)) {
                    char * full_dir = dirscat(working_dir, next_dir_str);
                    if (full_dir) {
                        std::string full_dir_str(full_dir);
                        delete [] full_dir; full_dir = NULL;
                        if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) {
                            dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str());
                            return FALSE;
                        }
                        dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str());
                        if (fs_remap->AddMapping(full_dir_str, next_dir_str)) {
                            // FilesystemRemap object prints out an error message for us.
                            return FALSE;
                        }
                    } else {
                        dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str());
                        return FALSE;
                    }
                } else {
                    dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir);
                }
            }
        } else {
            dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str());
            return FALSE;
        }
    }

    // have OsProc start the job
    //
    int retval = OsProc::StartJob(&fi, fs_remap);

    if (fs_remap != NULL) {
        delete fs_remap;
    }

#if defined(HAVE_EXT_LIBCGROUP)

    // Set fairshare limits.  Note that retval == 1 indicates success, 0 is failure.
    if (cgroup && retval) {
        std::string mem_limit;
        param(mem_limit, "MEMORY_LIMIT", "soft");
        bool mem_is_soft = mem_limit == "soft";
        std::string cgroup_string = cgroup;
        CgroupLimits climits(cgroup_string);
        if (mem_is_soft || (mem_limit == "hard")) {
            ClassAd * MachineAd = Starter->jic->machClassAd();
            int MemMb;
            if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) {
                uint64_t MemMb_big = MemMb;
                climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft);
            } else {
                dprintf(D_ALWAYS, "Not setting memory soft limit in cgroup because "
                        "Memory attribute missing in machine ad.\n");
            }
        } else if (mem_limit == "none") {
            dprintf(D_FULLDEBUG, "Not enforcing memory soft limit.\n");
        } else {
            dprintf(D_ALWAYS, "Invalid value of MEMORY_LIMIT: %s.  Ignoring.\n", mem_limit.c_str());
        }

        // Now, set the CPU shares
        ClassAd * MachineAd = Starter->jic->machClassAd();
        int slotWeight;
        if (MachineAd->LookupInteger(ATTR_SLOT_WEIGHT, slotWeight)) {
            climits.set_cpu_shares(slotWeight*100);
        } else {
            dprintf(D_FULLDEBUG, "Invalid value of SlotWeight in machine ClassAd; ignoring.\n");
        }
    }
#endif

    return retval;
}
bool
VMGahpServer::startUp(Env *job_env, const char *workingdir, int nice_inc, FamilyInfo *family_info)
{
    //check if we already have spawned a vmgahp server
    if( m_vmgahp_pid > 0 ) {
        //vmgahp is already running
        return true;
    }

    if( !m_job_ad ) {
        start_err_msg = "No JobAd in VMGahpServer::startUp()";
        dprintf(D_ALWAYS,"%s\n", start_err_msg.Value());
        return false;
    }

    MyString JobName;
    if( m_vmgahp_server.IsEmpty() ) {
        start_err_msg = "No path for vmgahp in VMGahpServer::startUp()";
        dprintf(D_ALWAYS,"%s\n", start_err_msg.Value());
        return false;
    }

    JobName = m_vmgahp_server;

    // Create the pipes (stdin, stdout, and optionally stderr) that we
    // will use to communicate with the vmgahp server
    int stdin_pipefds[2];
    int stdout_pipefds[2];
    int stderr_pipefds[2];

    if(!daemonCore->Create_Pipe(stdin_pipefds,
                true,   // read end registerable
                false,  // write end not registerable
                false,  // read end blocking
                false   // write end blocking
                )) {
        start_err_msg = "unable to create pipe to stdin of VM gahp";
        dprintf(D_ALWAYS,"%s\n", start_err_msg.Value());
        return false;
    }
    if(!daemonCore->Create_Pipe(stdout_pipefds,
                true,   // read end registerable
                false,  // write end not registerable
                false,  // read end blocking
                false   // write end blocking
                )) {
        // blocking read
        start_err_msg = "unable to create pipe to stdout of VM gahp";
        dprintf(D_ALWAYS,"%s\n", start_err_msg.Value());
        return false;
    }
    if( m_include_gahp_log ) {
        if(!daemonCore->Create_Pipe(stderr_pipefds,
                    true,   // read end registerable
                    false,  // write end not registerable
                    true,   // read end non-blocking
                    true    // write end non-blocking
                    )) {
            // nonblocking read
            start_err_msg = "unable to create pipe to stderr of VM gahp";
            dprintf(D_ALWAYS,"%s\n", start_err_msg.Value());
            return false;
        }
    }

    int io_redirect[3];
    io_redirect[0] = stdin_pipefds[0];   //stdin gets read side of in pipe
    io_redirect[1] = stdout_pipefds[1];  //stdout gets write side of out pipe
    if( m_include_gahp_log ) {
        io_redirect[2] = stderr_pipefds[1];  //stderr gets write side of err pipe
    } else {
        int null_fd = safe_open_wrapper_follow(NULL_FILE, O_WRONLY | O_APPEND, 0666);
        if( null_fd < 0 ) {
            start_err_msg = "unable to open null file for stderr of VM gahp";
            dprintf(D_ALWAYS,"Failed to open '%s':%s (errno %d)\n",
                    NULL_FILE, strerror(errno), errno);
            return false;
        }
        io_redirect[2] = null_fd;
    }

    // Set Arguments
    ArgList vmgahp_args;

    vmgahp_args.SetArgV1SyntaxToCurrentPlatform();
    vmgahp_args.AppendArg(m_vmgahp_server.Value());

    // Add daemonCore options
    vmgahp_args.AppendArg("-f");
    if( m_include_gahp_log ) {
        vmgahp_args.AppendArg("-t");
    }
    vmgahp_args.AppendArg("-M");
    vmgahp_args.AppendArg(VMGAHP_STANDALONE_MODE);

    MyString args_string;
    vmgahp_args.GetArgsStringForDisplay(&args_string, 1);
    dprintf( D_ALWAYS, "About to exec %s %s\n", JobName.Value(),
             args_string.Value() );

#if !defined(WIN32)
    uid_t vmgahp_user_uid = (uid_t) -1;
    gid_t vmgahp_user_gid = (gid_t) -1;

    if( can_switch_ids() ) {
        // Condor runs as root
        vmgahp_user_uid = get_user_uid();
        vmgahp_user_gid = get_user_gid();
    }
    else if (Starter->condorPrivSepHelper() != NULL) {
        vmgahp_user_uid = Starter->condorPrivSepHelper()->get_uid();
        char* user_name;
        if (!pcache()->get_user_name(vmgahp_user_uid, user_name)) {
            EXCEPT("unable to get user name for UID %u", vmgahp_user_uid);
        }
        if (!pcache()->get_user_ids(user_name, vmgahp_user_uid, vmgahp_user_gid)) {
            EXCEPT("unable to get GID for UID %u", vmgahp_user_uid);
        }
        free(user_name);
    }
    else {
        // vmgahp may have setuid-root (e.g. vmgahp for Xen)
        vmgahp_user_uid = get_condor_uid();
        vmgahp_user_gid = get_condor_gid();
    }

    // Setup vmgahp user uid/gid
    if( vmgahp_user_uid > 0 ) {
        if( vmgahp_user_gid <= 0 ) {
            vmgahp_user_gid = vmgahp_user_uid;
        }

        MyString tmp_str;
        tmp_str.sprintf("%d", (int)vmgahp_user_uid);
        job_env->SetEnv("VMGAHP_USER_UID", tmp_str.Value());
        tmp_str.sprintf("%d", (int)vmgahp_user_gid);
        job_env->SetEnv("VMGAHP_USER_GID", tmp_str.Value());
    }
#endif

    job_env->SetEnv("VMGAHP_VMTYPE", m_vm_type.Value());
    job_env->SetEnv("VMGAHP_WORKING_DIR", workingdir);

    // Grab the full environment back out of the Env object
    if(IsFulldebug(D_FULLDEBUG)) {
        MyString env_str;
        job_env->getDelimitedStringForDisplay(&env_str);
        dprintf(D_FULLDEBUG, "Env = %s\n", env_str.Value());
    }

    priv_state vmgahp_priv = PRIV_ROOT;
#if defined(WIN32)
    // TODO..
    // Currently vmgahp for VMware VM universe can't run as user on Windows.
    // It seems like a bug of VMware. VMware command line tool such as "vmrun"
    // requires Administrator privilege.
    // -jaeyoung 06/15/07
    if( strcasecmp(m_vm_type.Value(), CONDOR_VM_UNIVERSE_VMWARE ) == MATCH ) {
        vmgahp_priv = PRIV_UNKNOWN;
    }
#endif

    m_vmgahp_pid = daemonCore->Create_Process(
            JobName.Value(),  //Name of executable
            vmgahp_args,      //Args
            vmgahp_priv,      //Priv state
            1,                //id for our registered reaper
            FALSE,            //do not want a command port
            job_env,          //env
            workingdir,       //cwd
            family_info,      //family_info
            NULL,             //network sockets to inherit
            io_redirect,      //redirect stdin/out/err
            NULL,
            nice_inc );

    //NOTE: Create_Process() saves the errno for us if it is an
    //"interesting" error.
    char const *create_process_error = NULL;
    if(m_vmgahp_pid == FALSE && errno) create_process_error = strerror(errno);

    // Now that the VMGAHP server is running, close the sides of
    // the pipes we gave away to the server, and stash the ones
    // we want to keep in an object data member.
    daemonCore->Close_Pipe(io_redirect[0]);
    daemonCore->Close_Pipe(io_redirect[1]);
    if( m_include_gahp_log ) {
        daemonCore->Close_Pipe(io_redirect[2]);
    } else {
        close(io_redirect[2]);
    }

    if ( m_vmgahp_pid == FALSE ) {
        m_vmgahp_pid = -1;
        start_err_msg = "Failed to start vm-gahp server";
        dprintf(D_ALWAYS, "%s (%s)\n", start_err_msg.Value(),
                m_vmgahp_server.Value());
        if(create_process_error) {
            MyString err_msg = "Failed to execute '";
            err_msg += m_vmgahp_server.Value();
            err_msg += "'";
            if(!args_string.IsEmpty()) {
                err_msg += " with arguments ";
                err_msg += args_string.Value();
            }
            err_msg += ": ";
            err_msg += create_process_error;
            dprintf(D_ALWAYS, "Failed to start vmgahp server (%s)\n",
                    err_msg.Value());
        }
        return false;
    }

    dprintf(D_ALWAYS, "VMGAHP server pid=%d\n", m_vmgahp_pid);

    m_vmgahp_writefd = stdin_pipefds[1];
    m_vmgahp_readfd = stdout_pipefds[0];
    if( m_include_gahp_log ) {
        m_vmgahp_errorfd = stderr_pipefds[0];
    }

    // Now initialization is done
    m_is_initialized = true;

    // print initial stderr messages from vmgahp
    printSystemErrorMsg();

    // Read the initial greeting from the vm-gahp, which is the version
    if( command_version() == false ) {
        start_err_msg = "Internal vmgahp server error";
        dprintf(D_ALWAYS,"Failed to read vmgahp server version\n");
        printSystemErrorMsg();
        cleanup();
        return false;
    }

    dprintf(D_FULLDEBUG,"VMGAHP server version: %s\n", m_vmgahp_version.Value());

    // Now see what commands this server supports.
    if( command_commands() == false ) {
        start_err_msg = "Internal vmgahp server error";
        dprintf(D_ALWAYS,"Failed to read supported commands from vmgahp server\n");
        printSystemErrorMsg();
        cleanup();
        return false;
    }

    // Now see what virtual machine types this server supports
    if( command_support_vms() == false ) {
        start_err_msg = "Internal vmgahp server error";
        dprintf(D_ALWAYS,"Failed to read supported vm types from vmgahp server\n");
        printSystemErrorMsg();
        cleanup();
        return false;
    }

    int result = -1;
    if( m_include_gahp_log ) {
        result = daemonCore->Register_Pipe(m_vmgahp_errorfd,
                "m_vmgahp_errorfd",
                static_cast<PipeHandlercpp>(&VMGahpServer::err_pipe_ready),
                "VMGahpServer::err_pipe_ready", this);

        if( result == -1 ) {
            dprintf(D_ALWAYS,"Failed to register vmgahp stderr pipe\n");
            if(m_stderr_tid != -1) {
                daemonCore->Cancel_Timer(m_stderr_tid);
                m_stderr_tid = -1;
            }
            m_stderr_tid = daemonCore->Register_Timer(2, 2,
                    (TimerHandlercpp)&VMGahpServer::err_pipe_ready,
                    "VMGahpServer::err_pipe_ready", this);
            if( m_stderr_tid == -1 ) {
                start_err_msg = "Internal vmgahp server error";
                dprintf(D_ALWAYS,"Failed to register stderr timer\n");
                printSystemErrorMsg();
                cleanup();
                return false;
            }
        }
    }

    // try to turn on vmgahp async notification mode
    if ( !command_async_mode_on() ) {
        // not supported, set a poll interval
        m_is_async_mode = false;
        setPollInterval(m_pollInterval);
    } else {
        // command worked... register the pipe and stop polling
        result = daemonCore->Register_Pipe(m_vmgahp_readfd,
                "m_vmgahp_readfd",
                static_cast<PipeHandlercpp>(&VMGahpServer::pipe_ready),
                "VMGahpServer::pipe_ready", this);
        if( result == -1 ) {
            // failed to register the pipe for some reason; fall
            // back on polling (yuck).
            dprintf(D_ALWAYS,"Failed to register vmgahp Read pipe\n");
            m_is_async_mode = false;
            setPollInterval(m_pollInterval);
        } else {
            // pipe is registered.  stop polling.
            setPollInterval(0);
            m_is_async_mode = true;
        }
    }

    return true;
}
//
// Because we fork before calling docker, we don't actually
// care if the image is stored locally or not (except to the extent that
// remote image pull violates the principle of least astonishment).
//
int DockerAPI::run(
    ClassAd &machineAd,
    ClassAd &jobAd,
    const std::string & containerName,
    const std::string & imageID,
    const std::string & command,
    const ArgList & args,
    const Env & env,
    const std::string & sandboxPath,
    const std::list<std::string> extraVolumes,
    int & pid,
    int * childFDs,
    CondorError & /* err */ )
{
    gc_image(imageID);
    //
    // We currently assume that the system has been configured so that
    // anyone (user) who can run an HTCondor job can also run docker.  It's
    // also apparently a security worry to run Docker as root, so let's not.
    //
    ArgList runArgs;
    if ( ! add_docker_arg(runArgs))
        return -1;
    runArgs.AppendArg( "run" );

    // Write out a file with the container ID.
    // FIXME: The startd can check this to clean up after us.
    // This needs to go into a directory that condor user
    // can write to.
/*
    std::string cidFileName = sandboxPath + "/.cidfile";
    runArgs.AppendArg( "--cidfile=" + cidFileName );
*/

    // Configure resource limits.

    // First cpus
    int cpus;
    int cpuShare;

    if (machineAd.LookupInteger(ATTR_CPUS, cpus)) {
        cpuShare = 10 * cpus;
    } else {
        cpuShare = 10;
    }
    std::string cpuShareStr;
    formatstr(cpuShareStr, "--cpu-shares=%d", cpuShare);
    runArgs.AppendArg(cpuShareStr);

    // Now memory
    int memory; // in Megabytes
    if (machineAd.LookupInteger(ATTR_MEMORY, memory)) {
        std::string mem;
        formatstr(mem, "--memory=%dm", memory);
        runArgs.AppendArg(mem);
    }

    // drop unneeded Linux capabilities
    if (param_boolean("DOCKER_DROP_ALL_CAPABILITIES", true /*default*/,
                      true /*do_log*/, &machineAd, &jobAd)) {
        runArgs.AppendArg("--cap-drop=all");

        // --no-new-privileges flag appears in docker 1.11
        if (DockerAPI::majorVersion > 1 || DockerAPI::minorVersion > 10) {
            runArgs.AppendArg("--no-new-privileges");
        }
    }

    // Give the container a useful name
    std::string hname = makeHostname(&machineAd, &jobAd);
    runArgs.AppendArg("--hostname");
    runArgs.AppendArg(hname.c_str());

    // Now the container name
    runArgs.AppendArg( "--name" );
    runArgs.AppendArg( containerName );

    if ( ! add_env_to_args_for_docker(runArgs, env)) {
        dprintf( D_ALWAYS | D_FAILURE, "Failed to pass environment to docker.\n" );
        return -8;
    }

    // Map the external sandbox to the internal sandbox.
    runArgs.AppendArg( "--volume" );
    runArgs.AppendArg( sandboxPath + ":" + sandboxPath );

    // Now any extra volumes
    for (std::list<std::string>::const_iterator it = extraVolumes.begin(); it != extraVolumes.end(); it++) {
        runArgs.AppendArg("--volume");
        std::string volume = *it;
        runArgs.AppendArg(volume);
    }

    // Start in the sandbox.
    runArgs.AppendArg( "--workdir" );
    runArgs.AppendArg( sandboxPath );

    // Run with the uid that condor selects for the user
    // either a slot user or submitting user or nobody
    uid_t uid = 0;
    gid_t gid = 0;

    // Docker doesn't actually run on Windows, but we compile
    // on Windows because...
#ifndef WIN32
    uid = get_user_uid();
    gid = get_user_gid();
#endif

    if ((uid == 0) || (gid == 0)) {
        dprintf(D_ALWAYS|D_FAILURE, "Failed to get userid to run docker job\n");
        return -9;
    }

    runArgs.AppendArg("--user");
    std::string uidgidarg;
    formatstr(uidgidarg, "%d:%d", uid, gid);
    runArgs.AppendArg(uidgidarg);

    // Run the command with its arguments in the image.
    runArgs.AppendArg( imageID );

    // If no command given, the default command in the image will run
    if (command.length() > 0) {
        runArgs.AppendArg( command );
    }

    runArgs.AppendArgsFromArgList( args );

    MyString displayString;
    runArgs.GetArgsStringForLogging( & displayString );
    dprintf( D_ALWAYS, "Attempting to run: %s\n", displayString.c_str() );

    //
    // If we run Docker attached, we avoid a race condition where
    // 'docker logs --follow' returns before 'docker rm' knows that the
    // container is gone (and refuses to remove it).  Of course, we
    // can't block, so we have a proxy process run attached for us.
    //
    FamilyInfo fi;
    fi.max_snapshot_interval = param_integer( "PID_SNAPSHOT_INTERVAL", 15 );
    int childPID = daemonCore->Create_Process( runArgs.GetArg(0), runArgs,
        PRIV_CONDOR_FINAL, 1, FALSE, FALSE, NULL, "/",
        & fi, NULL, childFDs );

    if( childPID == FALSE ) {
        dprintf( D_ALWAYS | D_FAILURE, "Create_Process() failed.\n" );
        return -1;
    }
    pid = childPID;

    return 0;
}