int VanillaProc::StartJob() { dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n"); // vanilla jobs, unlike standard jobs, are allowed to run // shell scripts (or as is the case on NT, batch files). so // edit the ad so we start up a shell, pass the executable as // an argument to the shell, if we are asked to run a .bat file. #ifdef WIN32 CHAR interpreter[MAX_PATH+1], systemshell[MAX_PATH+1]; const char* jobtmp = Starter->jic->origJobName(); int joblen = strlen(jobtmp); const char *extension = joblen > 0 ? &(jobtmp[joblen-4]) : NULL; bool binary_executable = ( extension && ( MATCH == strcasecmp ( ".exe", extension ) || MATCH == strcasecmp ( ".com", extension ) ) ), java_universe = ( CONDOR_UNIVERSE_JAVA == job_universe ); ArgList arguments; MyString filename, jobname, error; if ( extension && !java_universe && !binary_executable ) { /** since we do not actually know how long the extension of the file is, we'll need to hunt down the '.' in the path, if it exists */ extension = strrchr ( jobtmp, '.' ); if ( !extension ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): Failed to extract " "the file's extension.\n" ); /** don't fail here, since we want executables to run as usual. That is, some condor jobs submit executables that do not have the '.exe' extension, but are, nonetheless, executable binaries. For instance, a submit script may contain: executable = executable$(OPSYS) */ } else { /** pull out the path to the executable */ if ( !JobAd->LookupString ( ATTR_JOB_CMD, jobname ) ) { /** fall back on Starter->jic->origJobName() */ jobname = jobtmp; } /** If we transferred the job, it may have been renamed to condor_exec.exe even though it is not an executable. Here we rename it back to a the correct extension before it will run. 
*/ if ( MATCH == strcasecmp ( CONDOR_EXEC, condor_basename ( jobname.Value () ) ) ) { filename.formatstr ( "condor_exec%s", extension ); if (rename(CONDOR_EXEC, filename.Value()) != 0) { dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: " "failed to rename executable from %s to %s\n", CONDOR_EXEC, filename.Value() ); } } else { filename = jobname; } /** Since we've renamed our executable, we need to update the job ad to reflect this change. */ if ( !JobAd->Assign ( ATTR_JOB_CMD, filename ) ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): ERROR: failed to " "set new executable name.\n" ); return FALSE; } /** We've moved the script to argv[1], so we need to add the remaining arguments to positions argv[2].. argv[/n/]. */ if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) || !arguments.InsertArgsIntoClassAd ( JobAd, NULL, &error ) ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): ERROR: failed to " "get arguments from job ad: %s\n", error.Value () ); return FALSE; } /** Since we know already we don't want this file returned to us, we explicitly add it to an exception list which will stop the file transfer mechanism from considering it for transfer back to its submitter */ Starter->jic->removeFromOutputFiles ( filename.Value () ); } } #endif // set up a FamilyInfo structure to tell OsProc to register a family // with the ProcD in its call to DaemonCore::Create_Process // FamilyInfo fi; // take snapshots at no more than 15 seconds in between, by default // fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15); m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated(); if( ThisProcRunsAlongsideMainProc() ) { // If we track a secondary proc's family tree (such as // sshd) using the same dedicated account as the job's // family tree, we could end up killing the job when we // clean up the secondary family. 
m_dedicated_account = NULL; } if (m_dedicated_account) { // using login-based family tracking fi.login = m_dedicated_account; // The following message is documented in the manual as the // way to tell whether the dedicated execution account // configuration is being used. dprintf(D_ALWAYS, "Tracking process family by login \"%s\"\n", fi.login); } FilesystemRemap * fs_remap = NULL; #if defined(LINUX) // on Linux, we also have the ability to track processes via // a phony supplementary group ID // gid_t tracking_gid = 0; if (param_boolean("USE_GID_PROCESS_TRACKING", false)) { if (!can_switch_ids() && (Starter->condorPrivSepHelper() == NULL)) { EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify " "the group list of our children unless running as " "root or using PrivSep"); } fi.group_ptr = &tracking_gid; } // Increase the OOM score of this process; the child will inherit it. // This way, the job will be heavily preferred to be killed over a normal process. // OOM score is currently exponential - a score of 4 is a factor-16 increase in // the OOM score. setupOOMScore(4,800); #endif #if defined(HAVE_EXT_LIBCGROUP) // Determine the cgroup std::string cgroup_base; param(cgroup_base, "BASE_CGROUP", ""); MyString cgroup_str; const char *cgroup = NULL; /* Note on CONDOR_UNIVERSE_LOCAL - The cgroup setup code below * requires a unique name for the cgroup. It relies on * uniqueness of the MachineAd's Name * attribute. Unfortunately, in the local universe the * MachineAd (mach_ad elsewhere) is never populated, because * there is no machine. As a result the ASSERT on * starter_name fails. This means that the local universe * will not work on any machine that has BASE_CGROUP * configured. A potential workaround is to set * STARTER.BASE_CGROUP on any machine that is also running a * schedd, but that disables cgroup support from a * co-resident startd. 
Instead, I'm disabling cgroup support * from within the local universe until the intraction of * local universe and cgroups can be properly worked * out. -matt 7 nov '12 */ if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup_base.length()) { MyString cgroup_uniq; std::string starter_name, execute_str; param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN"); // Note: Starter is a global variable from os_proc.cpp Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name); if (starter_name.size() == 0) { char buf[16]; sprintf(buf, "%d", getpid()); starter_name = buf; } //ASSERT (starter_name.size()); cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str()); const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'}; cgroup_uniq.replaceString(dir_delim, "_"); cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR, cgroup_uniq.Value()); cgroup_str += this->CgroupSuffix(); cgroup = cgroup_str.Value(); ASSERT (cgroup != NULL); fi.cgroup = cgroup; dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup); } #endif // The chroot stuff really only works on linux #ifdef LINUX { // Have Condor manage a chroot std::string requested_chroot_name; JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name); const char * allowed_root_dirs = param("NAMED_CHROOT"); if (requested_chroot_name.size()) { dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str()); StringList chroot_list(allowed_root_dirs); chroot_list.rewind(); const char * next_chroot; bool acceptable_chroot = false; std::string requested_chroot; while ( (next_chroot=chroot_list.next()) ) { MyString chroot_spec(next_chroot); chroot_spec.Tokenize(); const char * chroot_name = chroot_spec.GetNextToken("=", false); if (chroot_name == NULL) { dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value()); } const char * next_dir = chroot_spec.GetNextToken("=", false); if (chroot_name == NULL) { dprintf(D_ALWAYS, "Invalid named chroot: %s\n", 
chroot_spec.Value()); } dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value()); if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) { acceptable_chroot = true; requested_chroot = next_dir; } } // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit. // Is this the responsibility of Condor to check, or the sysadmin who set it up? if (!acceptable_chroot) { return FALSE; } dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str()); std::stringstream ss; std::stringstream ss2; ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid(); std::string execute_dir = ss2.str(); ss << requested_chroot << DIR_DELIM_CHAR << ss2.str(); std::string full_dir_str = ss.str(); if (is_trivial_rootdir(requested_chroot)) { dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str()); } else if (IsDirectory(execute_dir.c_str())) { { TemporaryPrivSentry sentry(PRIV_ROOT); if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) { dprintf( D_FAILURE|D_ALWAYS, "Failed to create sandbox directory in chroot (%s): %s\n", full_dir_str.c_str(), strerror(errno) ); return FALSE; } if (chown(full_dir_str.c_str(), get_user_uid(), get_user_gid()) == -1) { EXCEPT("chown error on %s: %s", full_dir_str.c_str(), strerror(errno)); } } if (!fs_remap) { fs_remap = new FilesystemRemap(); } dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str()); if (fs_remap->AddMapping(execute_dir, full_dir_str)) { // FilesystemRemap object prints out an error message for us. 
return FALSE; } dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/"); std::string root_str("/"); if (fs_remap->AddMapping(requested_chroot, root_str)) { return FALSE; } } else { dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str()); } } else { dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n"); } } // End of chroot #endif // On Linux kernel 2.4.19 and later, we can give each job its // own FS mounts. auto_free_ptr mount_under_scratch(param("MOUNT_UNDER_SCRATCH")); if (mount_under_scratch) { // try evaluating mount_under_scratch as a classad expression, if it is // an expression it must return a string. if it's not an expression, just // use it as a string (as we did before 8.3.6) classad::Value value; if (JobAd->EvaluateExpr(mount_under_scratch.ptr(), value)) { const char * pval = NULL; if (value.IsStringValue(pval)) { mount_under_scratch.set(strdup(pval)); } else { // was an expression, but not a string, so report and error and fail. dprintf(D_ALWAYS | D_ERROR, "ERROR: MOUNT_UNDER_SCRATCH does not evaluate to a string, it is : %s\n", ClassAdValueToString(value)); return FALSE; } } } // if execute dir is encrypted, add /tmp and /var/tmp to mount_under_scratch bool encrypt_execdir = false; JobAd->LookupBool(ATTR_ENCRYPT_EXECUTE_DIRECTORY,encrypt_execdir); if (encrypt_execdir || param_boolean_crufty("ENCRYPT_EXECUTE_DIRECTORY",false)) { // prepend /tmp, /var/tmp to whatever admin wanted. 
don't worry // if admin already listed /tmp etc - subdirs can appear twice // in this list because AddMapping() ok w/ duplicate entries MyString buf("/tmp,/var/tmp,"); buf += mount_under_scratch.ptr(); mount_under_scratch.set(buf.StrDup()); } if (mount_under_scratch) { std::string working_dir = Starter->GetWorkingDir(); if (IsDirectory(working_dir.c_str())) { StringList mount_list(mount_under_scratch); mount_list.rewind(); if (!fs_remap) { fs_remap = new FilesystemRemap(); } char * next_dir; while ( (next_dir=mount_list.next()) ) { if (!*next_dir) { // empty string? mount_list.deleteCurrent(); continue; } std::string next_dir_str(next_dir); // Gah, I wish I could throw an exception to clean up these nested if statements. if (IsDirectory(next_dir)) { char * full_dir = dirscat(working_dir, next_dir_str); if (full_dir) { std::string full_dir_str(full_dir); delete [] full_dir; full_dir = NULL; if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) { dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str()); delete fs_remap; return FALSE; } dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str()); if (fs_remap->AddMapping(full_dir_str, next_dir_str)) { // FilesystemRemap object prints out an error message for us. 
delete fs_remap; return FALSE; } } else { dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str()); delete fs_remap; return FALSE; } } else { dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir); } } } else { dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str()); delete fs_remap; return FALSE; } mount_under_scratch.clear(); } #if defined(LINUX) // On Linux kernel 2.6.24 and later, we can give each // job its own PID namespace if (param_boolean("USE_PID_NAMESPACES", false)) { if (!can_switch_ids()) { EXCEPT("USE_PID_NAMESPACES enabled, but can't perform this " "call in Linux unless running as root."); } fi.want_pid_namespace = this->SupportsPIDNamespace(); if (fi.want_pid_namespace) { if (!fs_remap) { fs_remap = new FilesystemRemap(); } fs_remap->RemapProc(); } // When PID Namespaces are enabled, need to run the job // under the condor_pid_ns_init program, so that signals // propagate through to the child. 
// First tell the program where to log output status // via an environment variable if (param_boolean("USE_PID_NAMESPACE_INIT", true)) { Env env; MyString env_errors; MyString arg_errors; std::string filename; filename = Starter->GetWorkingDir(); filename += "/.condor_pid_ns_status"; env.MergeFrom(JobAd, &env_errors); env.SetEnv("_CONDOR_PID_NS_INIT_STATUS_FILENAME", filename); env.InsertEnvIntoClassAd(JobAd, &env_errors); Starter->jic->removeFromOutputFiles(condor_basename(filename.c_str())); this->m_pid_ns_status_filename = filename; // Now, set the job's CMD to the wrapper, and shift // over the arguments by one ArgList args; std::string cmd; JobAd->LookupString(ATTR_JOB_CMD, cmd); args.AppendArg(cmd); args.AppendArgsFromClassAd(JobAd, &arg_errors); args.InsertArgsIntoClassAd(JobAd, NULL, & arg_errors); std::string libexec; if( !param(libexec,"LIBEXEC") ) { dprintf(D_ALWAYS, "Cannot find LIBEXEC so can not run condor_pid_ns_init\n"); return 0; } std::string c_p_n_i = libexec + "/condor_pid_ns_init"; JobAd->Assign(ATTR_JOB_CMD, c_p_n_i); } } dprintf(D_FULLDEBUG, "PID namespace option: %s\n", fi.want_pid_namespace ? "true" : "false"); #endif // have OsProc start the job // int retval = OsProc::StartJob(&fi, fs_remap); if (fs_remap != NULL) { delete fs_remap; } #if defined(HAVE_EXT_LIBCGROUP) // Set fairshare limits. Note that retval == 1 indicates success, 0 is failure. 
// See Note near setup of param(BASE_CGROUP) if (CONDOR_UNIVERSE_LOCAL != job_universe && cgroup && retval) { std::string mem_limit; param(mem_limit, "CGROUP_MEMORY_LIMIT_POLICY", "soft"); bool mem_is_soft = mem_limit == "soft"; std::string cgroup_string = cgroup; CgroupLimits climits(cgroup_string); if (mem_is_soft || (mem_limit == "hard")) { ClassAd * MachineAd = Starter->jic->machClassAd(); int MemMb; if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) { uint64_t MemMb_big = MemMb; m_memory_limit = MemMb_big; climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft); // Note that ATTR_VIRTUAL_MEMORY on Linux // is sum of memory and swap, in Kilobytes int VMemKb; if (MachineAd->LookupInteger(ATTR_VIRTUAL_MEMORY, VMemKb)) { uint64_t memsw_limit = ((uint64_t)1024) * VMemKb; if (VMemKb > 0) { // we're not allowed to set memsw limit < // the hard memory limit. If we haven't set the hard // memory limit, the default may be infinity. // So, if we've set soft, set hard limit to memsw - one page if (mem_is_soft) { uint64_t hard_limit = memsw_limit - 4096; climits.set_memory_limit_bytes(hard_limit, false); } climits.set_memsw_limit_bytes(memsw_limit); } } else { dprintf(D_ALWAYS, "Not setting virtual memory limit in cgroup because " "Virtual Memory attribute missing in machine ad.\n"); } } else { dprintf(D_ALWAYS, "Not setting memory limit in cgroup because " "Memory attribute missing in machine ad.\n"); } } else if (mem_limit == "none") { dprintf(D_FULLDEBUG, "Not enforcing memory limit.\n"); } else { dprintf(D_ALWAYS, "Invalid value of CGROUP_MEMORY_LIMIT_POLICY: %s. 
Ignoring.\n", mem_limit.c_str()); } // Now, set the CPU shares ClassAd * MachineAd = Starter->jic->machClassAd(); int numCores = 1; if (MachineAd->LookupInteger(ATTR_CPUS, numCores)) { climits.set_cpu_shares(numCores*100); } else { dprintf(D_FULLDEBUG, "Invalid value of Cpus in machine ClassAd; ignoring.\n"); } setupOOMEvent(cgroup); } m_statistics.Reconfig(); // Now that the job is started, decrease the likelihood that the starter // is killed instead of the job itself. if (retval) { setupOOMScore(0,0); } #endif return retval; }
void doContactSchedd() { if (command_queue.IsEmpty()) { daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min return; } dprintf(D_FULLDEBUG,"in doContactSchedd\n"); SchedDRequest * current_command = NULL; int error=FALSE; std::string error_msg; CondorError errstack; bool do_reschedule = false; int failure_line_num = 0; int failure_errno = 0; // Try connecting to schedd DCSchedd dc_schedd ( ScheddAddr, ScheddPool ); if (dc_schedd.error() || !dc_schedd.locate()) { sprintf( error_msg, "Error locating schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); // If you can't connect return "Failure" on every job request command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0"}; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) { const char * result[] = { GAHP_RESULT_FAILURE, NULL, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } current_command->status = SchedDRequest::SDCS_COMPLETED; } } SchedDRequest::schedd_command_type commands [] = { SchedDRequest::SDC_REMOVE_JOB, SchedDRequest::SDC_HOLD_JOB, SchedDRequest::SDC_RELEASE_JOB }; const char * command_titles [] = { "REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" }; // REMOVE // HOLD // RELEASE int i=0; while (i<3) { StringList id_list; SimpleList <SchedDRequest*> this_batch; SchedDRequest::schedd_command_type 
this_command = commands[i]; const char * this_action = command_titles[i]; const char * this_reason = NULL; dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action); error = FALSE; // Create a batch of commands with the same command type AND the same reason command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != this_command) continue; if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0)) continue; if (this_reason == NULL) this_reason = current_command->reason; char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", current_command->cluster_id, current_command->proc_id); id_list.append (job_id_buff); this_batch.Append (current_command); } // If we haven't found any.... if (id_list.isEmpty()) { i++; continue; // ... then try the next command } // Perform the appropriate command on the current batch ClassAd * result_ad= NULL; if (this_command == SchedDRequest::SDC_REMOVE_JOB) { errstack.clear(); result_ad= dc_schedd.removeJobs ( &id_list, this_reason, &errstack); } else if (this_command == SchedDRequest::SDC_HOLD_JOB) { errstack.clear(); result_ad= dc_schedd.holdJobs ( &id_list, this_reason, NULL, &errstack); } else if (this_command == SchedDRequest::SDC_RELEASE_JOB) { errstack.clear(); result_ad= dc_schedd.releaseJobs ( &id_list, this_reason, &errstack); } else { EXCEPT( "Unexpected command type %d in doContactSchedd", this_command ); } // Analyze the result ad if (!result_ad) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s %s: %s", ScheddAddr, dc_schedd.addr(), errstack.getFullText() ); } else { result_ad->dPrint (D_FULLDEBUG); if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) { do_reschedule = true; } } // Go through the batch again, and create responses for each request this_batch.Rewind(); while (this_batch.Next(current_command)) { // Check the result char job_id_buff[30]; if (result_ad && (error == 
FALSE)) { sprintf (job_id_buff, "job_%d_%d", current_command->cluster_id, current_command->proc_id); int remove_result; if (result_ad->LookupInteger (job_id_buff, remove_result)) { switch (remove_result) { case AR_ERROR: error = TRUE; error_msg = "General Error"; break; case AR_SUCCESS: error = FALSE; break; case AR_NOT_FOUND: error = TRUE; error_msg = "Job not found"; break; case AR_BAD_STATUS: error = TRUE; error_msg = "Bad job status"; break; case AR_ALREADY_DONE: error = TRUE; error_msg = "Already done"; break; case AR_PERMISSION_DENIED: error = TRUE; error_msg = "Permission denied"; break; default: error = TRUE; error_msg = "Unknown Result"; } // hctiws } else { error_msg = "Unable to get result"; } // fi lookup result for job } // fi error == FALSE if (error) { dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n", this_action, current_command->cluster_id, current_command->proc_id, error_msg.c_str()); const char * result[2]; result[0] = GAHP_RESULT_FAILURE; result[1] = error_msg.c_str(); enqueue_result (current_command->request_id, result, 2); } else { dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n", this_action, current_command->cluster_id, current_command->proc_id); const char * result[2]; result[0] = GAHP_RESULT_SUCCESS; result[1] = NULL; enqueue_result (current_command->request_id, result, 2); } // fi error // Mark the status current_command->status = SchedDRequest::SDCS_COMPLETED; } // elihw this_batch if ( result_ad ) { delete result_ad; } } dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n"); // JOB_STAGE_IN int MAX_BATCH_SIZE=1; // This should be a config param SimpleList <SchedDRequest*> stage_in_batch; do { stage_in_batch.Clear(); command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN) continue; dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", current_command->cluster_id, 
current_command->proc_id); stage_in_batch.Append (current_command); if (stage_in_batch.Number() >= MAX_BATCH_SIZE) break; } if (stage_in_batch.Number() > 0) { ClassAd ** array = new ClassAd*[stage_in_batch.Number()]; i=0; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { array[i++] = current_command->classad; } error = FALSE; errstack.clear(); if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(), array, &errstack )) { error = TRUE; sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } delete [] array; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_IN requests } while (stage_in_batch.Number() > 0); dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n"); // JOB_STAGE_OUT SimpleList <SchedDRequest*> stage_out_batch; command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT) continue; stage_out_batch.Append (current_command); } if (stage_out_batch.Number() > 0) { std::string constraint = ""; stage_out_batch.Rewind(); int jobsexpected = stage_out_batch.Number(); while (stage_out_batch.Next(current_command)) { sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||", current_command->cluster_id, current_command->proc_id ); } constraint += "False"; error = FALSE; errstack.clear(); int jobssent; if (!dc_schedd.receiveJobSandbox( constraint.c_str(), &errstack, &jobssent )) { error = TRUE; sprintf( error_msg, "Error receiving files 
from schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if(error == FALSE && jobssent != jobsexpected) { error = TRUE; sprintf( error_msg, "Schedd %s didn't send expected files", ScheddAddr ); dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str()); } stage_out_batch.Rewind(); while (stage_out_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_OUT requests dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n"); CondorVersionInfo ver_info(dc_schedd.version()); bool delegate_credential; if ( ver_info.built_since_version(6,7,19) && param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) { delegate_credential = true; } else { delegate_credential = false; } // JOB_REFRESH_PROXY command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY) continue; time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad); time_t result_expiration_time = 0; bool result; errstack.clear(); if ( delegate_credential ) { result = dc_schedd.delegateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, expiration_time, &result_expiration_time, &errstack ); // Currently, we do not propagate the actual resulting // expiration time back to the gridmanager. We // probably should. 
} else { result = dc_schedd.updateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, &errstack ); } current_command->status = SchedDRequest::SDCS_COMPLETED; if (result == false) { sprintf( error_msg, "Error refreshing proxy to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); const char * result_to_queue[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result_to_queue, 2); } else { const char * result_to_queue[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result_to_queue, 2); } } // Now do all the QMGMT transactions error = FALSE; // Try connecting to the queue Qmgr_connection * qmgr_connection; if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else { errno = 0; AbortTransaction(); // Just so we can call BeginTransaction() in the loop if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n"); // UPDATE_CONSTRAINED // UDATE_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) && (current_command->command != SchedDRequest::SDC_UPDATE_JOB)) continue; if (qmgr_connection == NULL) goto update_report_result; error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } current_command->classad->ResetExpr(); ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = 
ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else { if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) { if( SetAttributeByConstraint(current_command->constraint, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s", errno, lhstr, rhstr, current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) { if( SetAttribute(current_command->cluster_id, current_command->proc_id, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d", lhstr, rhstr, current_command->cluster_id, current_command->proc_id); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } } if (error) break; } // elihw classad update_report_result: if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw dprintf 
(D_FULLDEBUG, "Processing UPDATE_LEASE requests\n"); // UPDATE_LEASE command_queue.Rewind(); while (command_queue.Next(current_command)) { error = FALSE; if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE) continue; std::string success_job_ids=""; if (qmgr_connection == NULL) { sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); error = TRUE; } else { error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } for (i=0; i<current_command->num_jobs; i++) { time_t time_now = time(NULL); int duration = current_command->expirations[i].expiration - time_now; dprintf (D_FULLDEBUG, "Job %d.%d SetTimerAttribute=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, duration); if (SetTimerAttribute (current_command->expirations[i].cluster, current_command->expirations[i].proc, ATTR_TIMER_REMOVE_CHECK, duration) < 0) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } dprintf (D_ALWAYS, "Unable to SetTimerAttribute(%d, %d), errno=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, errno); } else { // Append job id to the result line if (success_job_ids.length() > 0) success_job_ids += ","; sprintf_cat( success_job_ids, "%d.%d", current_command->expirations[i].cluster, current_command->expirations[i].proc); } } //rof jobs for request } // fi error if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { 
if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL, success_job_ids.length()?success_job_ids.c_str():NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw UPDATE_LEASE requests dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n"); // SUBMIT_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB) continue; int ClusterId = -1; int ProcId = -1; if (qmgr_connection == NULL) { error = TRUE; goto submit_report_result; } errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } error = FALSE; if ((ClusterId = NewCluster()) >= 0) { ProcId = NewProc (ClusterId); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } if ( ClusterId < 0 ) { error = TRUE; error_msg = "Unable to create a new job cluster"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else if ( ProcId < 0 ) { error = TRUE; error_msg = "Unable to create a new job proc"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if ( ClusterId == -2 || ProcId == -2 ) { error = TRUE; error_msg = "Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } // Adjust the argument/environment syntax based on the version // of the schedd we are talking to. if( error == FALSE) { CondorVersionInfo version_info(dc_schedd.version()); ArgList arglist; MyString arg_error_msg; Env env_obj; MyString env_error_msg; if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) || ! 
arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg)) { sprintf( error_msg, "ERROR: ClassAd problem in converting arguments to syntax " "for schedd (version=%s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", arg_error_msg.Value()); dprintf( D_ALWAYS,"%s\n", error_msg.c_str() ); error = TRUE; } if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) || !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info)) { sprintf( error_msg, "ERROR: Failed to convert environment to target syntax" " for schedd (version %s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", env_error_msg.Value()); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } if( error == FALSE ) { // See the comment in the function body of ExpandInputFileList // for an explanation of what is going on here. MyString transfer_input_error_msg; if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) { dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() ); error = TRUE; } } if ( error == FALSE ) { current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId); current_command->classad->Assign(ATTR_PROC_ID, ProcId); // Special case for the job lease int expire_time; if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) { if ( SetTimerAttribute( ClusterId, ProcId, ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL) ) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d", ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; goto submit_report_result; } current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK ); } // Set all the classad attribute on the remote classad current_command->classad->ResetExpr(); 
ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else if( SetAttribute (ClusterId, ProcId, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d", lhstr, rhstr, ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } if (error) break; } // elihw classad } // fi error==FALSE submit_report_result: char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", ClusterId, ProcId); if (error) { const char * result[] = { GAHP_RESULT_FAILURE, job_id_buff, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } current_command->status = SchedDRequest::SDCS_COMPLETED; } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, job_id_buff, NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } // elihw dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n"); // STATUS_CONSTRAINED command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED) continue; if (qmgr_connection != NULL) { SimpleList <MyString *> matching_ads; error = FALSE; ClassAd *next_ad; ClassAdList adlist; // Only use 
GetAllJobsByConstraint if remote schedd is // 6.9.5 or newer. Previous versions either did not // support this call, or they closed the Qmgmt connection // as a side-effect of this call. if( ver_info.built_since_version(6,9,5) ) { dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n", current_command->constraint ); // NOTE: this could be made more efficient if we knew // the list of attributes to query. For lack of that, // we just get all attributes. GetAllJobsByConstraint( current_command->constraint, "", adlist); } else { // This is the old latency-prone method. dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n", current_command->constraint ); next_ad = GetNextJobByConstraint( current_command->constraint, 1 ); while( next_ad != NULL ) { adlist.Insert( next_ad ); next_ad = GetNextJobByConstraint( current_command->constraint, 0 ); } } // NOTE: ClassAdList will deallocate the ClassAds in it adlist.Rewind(); while( (next_ad=adlist.Next()) ) { MyString * da_buffer = new MyString(); // Use a ptr to avoid excessive copying if ( useXMLClassads ) { ClassAdXMLUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } else { NewClassAdUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } matching_ads.Append (da_buffer); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } // now output this list of classads into a result const char ** result = new const char* [matching_ads.Length() + 3]; std::string _ad_count; sprintf( _ad_count, "%d", matching_ads.Length() ); int count=0; result[count++] = GAHP_RESULT_SUCCESS; result[count++] = NULL; result[count++] = _ad_count.c_str(); MyString *next_string; matching_ads.Rewind(); while (matching_ads.Next(next_string)) { result[count++] = next_string->Value(); } enqueue_result (current_command->request_id, result, count); current_command->status = SchedDRequest::SDCS_COMPLETED; 
// Cleanup matching_ads.Rewind(); while (matching_ads.Next(next_string)) { delete next_string; } //CommitTransaction(); delete [] result; } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } //elihw contact_schedd_disconnect: if ( qmgr_connection != NULL ) { DisconnectQ (qmgr_connection, FALSE); } if ( failure_line_num ) { // We had an error talking to the schedd. Take all of our // incomplete commands and mark them as failed. // TODO Consider retrying these commands, rather than // immediately marking them as failed. if ( failure_errno == ETIMEDOUT ) { dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in " "doContactSchedd()\n", failure_line_num ); sprintf( error_msg, "Timed out talking to schedd" ); } else { dprintf( D_ALWAYS, "Error talking to schedd at line %d in " "doContactSchedd(), errno=%d (%s)\n", failure_line_num, failure_errno, strerror(failure_errno) ); sprintf( error_msg, "Error talking to schedd" ); } command_queue.Rewind(); while (command_queue.Next(current_command)) { if ( current_command->status != SchedDRequest::SDCS_NEW ) { continue; } switch( current_command->command ) { case SchedDRequest::SDC_UPDATE_JOB: case SchedDRequest::SDC_UPDATE_CONSTRAINED: { const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_UPDATE_LEASE: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_SUBMIT_JOB: { const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); current_command->status 
= SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_STATUS_CONSTRAINED: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; default: // Do nothing ; } } } if ( do_reschedule ) { dc_schedd.reschedule(); } // Write all of our results to our parent. flush_results(); dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n"); // Clean up the list command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status == SchedDRequest::SDCS_COMPLETED) { command_queue.DeleteCurrent(); delete current_command; } } // Come back soon.. // QUESTION: Should this always be a fixed time period? daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); }
int VanillaProc::StartJob() { dprintf(D_FULLDEBUG,"in VanillaProc::StartJob()\n"); // vanilla jobs, unlike standard jobs, are allowed to run // shell scripts (or as is the case on NT, batch files). so // edit the ad so we start up a shell, pass the executable as // an argument to the shell, if we are asked to run a .bat file. #ifdef WIN32 CHAR interpreter[MAX_PATH+1], systemshell[MAX_PATH+1]; const char* jobtmp = Starter->jic->origJobName(); int joblen = strlen(jobtmp); const char *extension = joblen > 0 ? &(jobtmp[joblen-4]) : NULL; bool binary_executable = ( extension && ( MATCH == strcasecmp ( ".exe", extension ) || MATCH == strcasecmp ( ".com", extension ) ) ), java_universe = ( CONDOR_UNIVERSE_JAVA == job_universe ); ArgList arguments; MyString filename, jobname, error; if ( extension && !java_universe && !binary_executable ) { /** since we do not actually know how long the extension of the file is, we'll need to hunt down the '.' in the path, if it exists */ extension = strrchr ( jobtmp, '.' ); if ( !extension ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): Failed to extract " "the file's extension.\n" ); /** don't fail here, since we want executables to run as usual. That is, some condor jobs submit executables that do not have the '.exe' extension, but are, nonetheless, executable binaries. For instance, a submit script may contain: executable = executable$(OPSYS) */ } else { /** pull out the path to the executable */ if ( !JobAd->LookupString ( ATTR_JOB_CMD, jobname ) ) { /** fall back on Starter->jic->origJobName() */ jobname = jobtmp; } /** If we transferred the job, it may have been renamed to condor_exec.exe even though it is not an executable. Here we rename it back to a the correct extension before it will run. 
*/ if ( MATCH == strcasecmp ( CONDOR_EXEC, condor_basename ( jobname.Value () ) ) ) { filename.formatstr ( "condor_exec%s", extension ); if (rename(CONDOR_EXEC, filename.Value()) != 0) { dprintf (D_ALWAYS, "VanillaProc::StartJob(): ERROR: " "failed to rename executable from %s to %s\n", CONDOR_EXEC, filename.Value() ); } } else { filename = jobname; } /** Since we've renamed our executable, we need to update the job ad to reflect this change. */ if ( !JobAd->Assign ( ATTR_JOB_CMD, filename ) ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): ERROR: failed to " "set new executable name.\n" ); return FALSE; } /** We've moved the script to argv[1], so we need to add the remaining arguments to positions argv[2].. argv[/n/]. */ if ( !arguments.AppendArgsFromClassAd ( JobAd, &error ) || !arguments.InsertArgsIntoClassAd ( JobAd, NULL, &error ) ) { dprintf ( D_ALWAYS, "VanillaProc::StartJob(): ERROR: failed to " "get arguments from job ad: %s\n", error.Value () ); return FALSE; } /** Since we know already we don't want this file returned to us, we explicitly add it to an exception list which will stop the file transfer mechanism from considering it for transfer back to its submitter */ Starter->jic->removeFromOutputFiles ( filename.Value () ); } } #endif // set up a FamilyInfo structure to tell OsProc to register a family // with the ProcD in its call to DaemonCore::Create_Process // FamilyInfo fi; // take snapshots at no more than 15 seconds in between, by default // fi.max_snapshot_interval = param_integer("PID_SNAPSHOT_INTERVAL", 15); m_dedicated_account = Starter->jic->getExecuteAccountIsDedicated(); if( ThisProcRunsAlongsideMainProc() ) { // If we track a secondary proc's family tree (such as // sshd) using the same dedicated account as the job's // family tree, we could end up killing the job when we // clean up the secondary family. 
m_dedicated_account = NULL; } if (m_dedicated_account) { // using login-based family tracking fi.login = m_dedicated_account; // The following message is documented in the manual as the // way to tell whether the dedicated execution account // configuration is being used. dprintf(D_ALWAYS, "Tracking process family by login \"%s\"\n", fi.login); } FilesystemRemap * fs_remap = NULL; #if defined(LINUX) // on Linux, we also have the ability to track processes via // a phony supplementary group ID // gid_t tracking_gid = 0; if (param_boolean("USE_GID_PROCESS_TRACKING", false)) { if (!can_switch_ids() && (Starter->condorPrivSepHelper() == NULL)) { EXCEPT("USE_GID_PROCESS_TRACKING enabled, but can't modify " "the group list of our children unless running as " "root or using PrivSep"); } fi.group_ptr = &tracking_gid; } #endif #if defined(HAVE_EXT_LIBCGROUP) // Determine the cgroup std::string cgroup_base; param(cgroup_base, "BASE_CGROUP", ""); MyString cgroup_str; const char *cgroup = NULL; if (cgroup_base.length()) { MyString cgroup_uniq; std::string starter_name, execute_str; param(execute_str, "EXECUTE", "EXECUTE_UNKNOWN"); // Note: Starter is a global variable from os_proc.cpp Starter->jic->machClassAd()->EvalString(ATTR_NAME, NULL, starter_name); ASSERT (starter_name.size()); cgroup_uniq.formatstr("%s_%s", execute_str.c_str(), starter_name.c_str()); const char dir_delim[2] = {DIR_DELIM_CHAR, '\0'}; cgroup_uniq.replaceString(dir_delim, "_"); cgroup_str.formatstr("%s%ccondor%s", cgroup_base.c_str(), DIR_DELIM_CHAR, cgroup_uniq.Value()); cgroup = cgroup_str.Value(); ASSERT (cgroup != NULL); fi.cgroup = cgroup; dprintf(D_FULLDEBUG, "Requesting cgroup %s for job.\n", cgroup); } #endif // The chroot stuff really only works on linux #ifdef LINUX { // Have Condor manage a chroot std::string requested_chroot_name; JobAd->EvalString("RequestedChroot", NULL, requested_chroot_name); const char * allowed_root_dirs = param("NAMED_CHROOT"); if (requested_chroot_name.size()) { 
dprintf(D_FULLDEBUG, "Checking for chroot: %s\n", requested_chroot_name.c_str()); StringList chroot_list(allowed_root_dirs); chroot_list.rewind(); const char * next_chroot; bool acceptable_chroot = false; std::string requested_chroot; while ( (next_chroot=chroot_list.next()) ) { MyString chroot_spec(next_chroot); chroot_spec.Tokenize(); const char * chroot_name = chroot_spec.GetNextToken("=", false); if (chroot_name == NULL) { dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value()); } const char * next_dir = chroot_spec.GetNextToken("=", false); if (chroot_name == NULL) { dprintf(D_ALWAYS, "Invalid named chroot: %s\n", chroot_spec.Value()); } dprintf(D_FULLDEBUG, "Considering directory %s for chroot %s.\n", next_dir, chroot_spec.Value()); if (IsDirectory(next_dir) && chroot_name && (strcmp(requested_chroot_name.c_str(), chroot_name) == 0)) { acceptable_chroot = true; requested_chroot = next_dir; } } // TODO: path to chroot MUST be all root-owned, or we have a nice security exploit. // Is this the responsibility of Condor to check, or the sysadmin who set it up? 
if (!acceptable_chroot) { return FALSE; } dprintf(D_FULLDEBUG, "Will attempt to set the chroot to %s.\n", requested_chroot.c_str()); std::stringstream ss; std::stringstream ss2; ss2 << Starter->GetExecuteDir() << DIR_DELIM_CHAR << "dir_" << getpid(); std::string execute_dir = ss2.str(); ss << requested_chroot << DIR_DELIM_CHAR << ss2.str(); std::string full_dir_str = ss.str(); if (is_trivial_rootdir(requested_chroot)) { dprintf(D_FULLDEBUG, "Requested a trivial chroot %s; this is a no-op.\n", requested_chroot.c_str()); } else if (IsDirectory(execute_dir.c_str())) { { TemporaryPrivSentry sentry(PRIV_ROOT); if( mkdir(full_dir_str.c_str(), S_IRWXU) < 0 ) { dprintf( D_FAILURE|D_ALWAYS, "Failed to create sandbox directory in chroot (%s): %s\n", full_dir_str.c_str(), strerror(errno) ); return FALSE; } if (chown(full_dir_str.c_str(), get_user_uid(), get_user_gid()) == -1) { EXCEPT("chown error on %s: %s", full_dir_str.c_str(), strerror(errno)); } } if (!fs_remap) { fs_remap = new FilesystemRemap(); } dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", execute_dir.c_str(), full_dir_str.c_str()); if (fs_remap->AddMapping(execute_dir, full_dir_str)) { // FilesystemRemap object prints out an error message for us. return FALSE; } dprintf(D_FULLDEBUG, "Adding mapping %s -> %s.\n", requested_chroot.c_str(), "/"); std::string root_str("/"); if (fs_remap->AddMapping(requested_chroot, root_str)) { return FALSE; } } else { dprintf(D_ALWAYS, "Unable to do chroot because working dir %s does not exist.\n", execute_dir.c_str()); } } else { dprintf(D_FULLDEBUG, "Value of RequestedChroot is unset.\n"); } } // End of chroot #endif // On Linux kernel 2.4.19 and later, we can give each job its // own FS mounts. 
char * mount_under_scratch = param("MOUNT_UNDER_SCRATCH"); if (mount_under_scratch) { std::string working_dir = Starter->GetWorkingDir(); if (IsDirectory(working_dir.c_str())) { StringList mount_list(mount_under_scratch); free(mount_under_scratch); mount_list.rewind(); if (!fs_remap) { fs_remap = new FilesystemRemap(); } char * next_dir; while ( (next_dir=mount_list.next()) ) { if (!*next_dir) { // empty string? mount_list.deleteCurrent(); continue; } std::string next_dir_str(next_dir); // Gah, I wish I could throw an exception to clean up these nested if statements. if (IsDirectory(next_dir)) { char * full_dir = dirscat(working_dir, next_dir_str); if (full_dir) { std::string full_dir_str(full_dir); delete [] full_dir; full_dir = NULL; if (!mkdir_and_parents_if_needed( full_dir_str.c_str(), S_IRWXU, PRIV_USER )) { dprintf(D_ALWAYS, "Failed to create scratch directory %s\n", full_dir_str.c_str()); return FALSE; } dprintf(D_FULLDEBUG, "Adding mapping: %s -> %s.\n", full_dir_str.c_str(), next_dir_str.c_str()); if (fs_remap->AddMapping(full_dir_str, next_dir_str)) { // FilesystemRemap object prints out an error message for us. return FALSE; } } else { dprintf(D_ALWAYS, "Unable to concatenate %s and %s.\n", working_dir.c_str(), next_dir_str.c_str()); return FALSE; } } else { dprintf(D_ALWAYS, "Unable to add mapping %s -> %s because %s doesn't exist.\n", working_dir.c_str(), next_dir, next_dir); } } } else { dprintf(D_ALWAYS, "Unable to perform mappings because %s doesn't exist.\n", working_dir.c_str()); return FALSE; } } // have OsProc start the job // int retval = OsProc::StartJob(&fi, fs_remap); if (fs_remap != NULL) { delete fs_remap; } #if defined(HAVE_EXT_LIBCGROUP) // Set fairshare limits. Note that retval == 1 indicates success, 0 is failure. 
if (cgroup && retval) { std::string mem_limit; param(mem_limit, "MEMORY_LIMIT", "soft"); bool mem_is_soft = mem_limit == "soft"; std::string cgroup_string = cgroup; CgroupLimits climits(cgroup_string); if (mem_is_soft || (mem_limit == "hard")) { ClassAd * MachineAd = Starter->jic->machClassAd(); int MemMb; if (MachineAd->LookupInteger(ATTR_MEMORY, MemMb)) { uint64_t MemMb_big = MemMb; climits.set_memory_limit_bytes(1024*1024*MemMb_big, mem_is_soft); } else { dprintf(D_ALWAYS, "Not setting memory soft limit in cgroup because " "Memory attribute missing in machine ad.\n"); } } else if (mem_limit == "none") { dprintf(D_FULLDEBUG, "Not enforcing memory soft limit.\n"); } else { dprintf(D_ALWAYS, "Invalid value of MEMORY_LIMIT: %s. Ignoring.\n", mem_limit.c_str()); } // Now, set the CPU shares ClassAd * MachineAd = Starter->jic->machClassAd(); int slotWeight; if (MachineAd->LookupInteger(ATTR_SLOT_WEIGHT, slotWeight)) { climits.set_cpu_shares(slotWeight*100); } else { dprintf(D_FULLDEBUG, "Invalid value of SlotWeight in machine ClassAd; ignoring.\n"); } } #endif return retval; }
int JavaProc::StartJob()
{
	// Build the full java command line for the job -- the JVM binary,
	// jar-file options, the chirp config property, any user JVM args,
	// the CondorJavaWrapper bootstrap class with its marker files, and
	// finally the user's own arguments -- write it all back into the
	// job ad, then delegate the actual spawn to VanillaProc::StartJob().
	// Returns 0 on any setup failure, otherwise whatever
	// VanillaProc::StartJob() returns.
	MyString java_cmd;
	char* jarfiles = NULL;
	ArgList args;
	MyString arg_buf;

	// Since we are adding to the argument list, we may need to deal
	// with platform-specific arg syntax in the user's args in order
	// to successfully merge them with the additional java VM args.
	args.SetArgV1SyntaxToCurrentPlatform();

	// Construct the list of jar files for the command line.
	// If a jar file is transferred locally, use its local name
	// (in the execute directory); otherwise use the original name.
	StringList jarfiles_orig_list;
	StringList jarfiles_local_list;
	StringList* jarfiles_final_list = NULL;

	if( JobAd->LookupString(ATTR_JAR_FILES,&jarfiles) ) {
		jarfiles_orig_list.initializeFromString( jarfiles );
		// LookupString allocates the buffer; the StringList keeps its
		// own copy, so release ours immediately.
		free( jarfiles );
		jarfiles = NULL;

		char * jarfile_name;
		const char * base_name;
		struct stat stat_buff;
		if( Starter->jic->iwdIsChanged() ) {
				// If the job's IWD has been changed (because we're
				// running in the sandbox due to file transfer), we
				// need to use a local version of the path to the jar
				// files, not the full paths from the submit machine.
			jarfiles_orig_list.rewind();
			while( (jarfile_name = jarfiles_orig_list.next()) ) {
					// Construct the local name: execute_dir/<basename>
				base_name = condor_basename( jarfile_name );
				MyString local_name = execute_dir;
				local_name += DIR_DELIM_CHAR;
				local_name += base_name;

				if( stat(local_name.Value(), &stat_buff) == 0 ) {
						// Jar file exists locally, use local name
					jarfiles_local_list.append( local_name.Value() );
				} else {
						// Use the original name
					jarfiles_local_list.append (jarfile_name);
				}
			} // while(jarfiles_orig_list)

				// jarfiles_local_list is our real copy...
			jarfiles_final_list = &jarfiles_local_list;
		} else { // !iwdIsChanged()
				// just use jarfiles_orig_list as our real copy...
			jarfiles_final_list = &jarfiles_orig_list;
		}
	}

	// Marker files in the execute directory; both paths are handed to
	// CondorJavaWrapper as its first two arguments below.
	startfile.formatstr("%s%cjvm.start",execute_dir,DIR_DELIM_CHAR);
	endfile.formatstr("%s%cjvm.end",execute_dir,DIR_DELIM_CHAR);

	// java_config() fills in the JVM command and its base argument list
	// (including the jar files); failure means Java isn't set up here.
	if( !java_config(java_cmd,&args,jarfiles_final_list) ) {
		dprintf(D_FAILURE|D_ALWAYS,"JavaProc: Java is not configured!\n");
		return 0;
	}

	// The executable actually spawned is the JVM, not the user's class.
	JobAd->Assign(ATTR_JOB_CMD, java_cmd.Value());

	// Point the chirp client at its config file in the execute directory.
	arg_buf.formatstr("-Dchirp.config=%s%cchirp.config",execute_dir,DIR_DELIM_CHAR);
	args.AppendArg(arg_buf.Value());

	// User-supplied JVM arguments: prefer the V2 syntax attribute when
	// present, otherwise fall back to the older V1 syntax attribute.
	char *jvm_args1 = NULL;
	char *jvm_args2 = NULL;
	MyString jvm_args_error;
	bool jvm_args_success = true;
	JobAd->LookupString(ATTR_JOB_JAVA_VM_ARGS1, &jvm_args1);
	JobAd->LookupString(ATTR_JOB_JAVA_VM_ARGS2, &jvm_args2);
	if(jvm_args2) {
		jvm_args_success = args.AppendArgsV2Raw(jvm_args2, &jvm_args_error);
	}
	else if(jvm_args1) {
		jvm_args_success = args.AppendArgsV1Raw(jvm_args1, &jvm_args_error);
	}
	// free(NULL) is a no-op, so these are safe whether or not the
	// attributes were found.
	free(jvm_args1);
	free(jvm_args2);
	if (!jvm_args_success) {
		dprintf(D_ALWAYS, "JavaProc: failed to parse JVM args: %s\n",
				jvm_args_error.Value());
		return 0;
	}

	// The wrapper class runs first and receives the start/end marker
	// file paths; the user's class and arguments follow after it.
	args.AppendArg("CondorJavaWrapper");
	args.AppendArg(startfile.Value());
	args.AppendArg(endfile.Value());

	MyString args_error;
	if(!args.AppendArgsFromClassAd(JobAd,&args_error)) {
		dprintf(D_ALWAYS,"JavaProc: failed to read job arguments: %s\n",
				args_error.Value());
		return 0;
	}

	// We are just talking to ourselves, so it is fine to use argument
	// syntax compatible with this current version of Condor.
	CondorVersionInfo ver_info;
	if(!args.InsertArgsIntoClassAd(JobAd,&ver_info,&args_error)) {
		dprintf(D_ALWAYS,"JavaProc: failed to insert java job arguments: %s\n",
				args_error.Value());
		return 0;
	}

	// Log the final command line for debugging.
	dprintf(D_ALWAYS,"JavaProc: Cmd=%s\n",java_cmd.Value());
	MyString args_string;
	args.GetArgsStringForDisplay(&args_string);
	dprintf(D_ALWAYS,"JavaProc: Args=%s\n",args_string.Value());

	// The JVM process itself is launched like any other vanilla job.
	return VanillaProc::StartJob();
}