int GLExecPrivSepHelper::create_process(const char* path, ArgList& args, Env& env, const char* iwd, int job_std_fds[3], const char* std_file_names[3], int nice_inc, size_t* core_size_ptr, int reaper_id, int dc_job_opts, FamilyInfo* family_info, int *, MyString *error_msg) { ASSERT(m_initialized); if (!proxy_valid_right_now()) { dprintf(D_ALWAYS, "GLExecPrivSepHelper::create_process: not invoking glexec since the proxy is not valid!\n"); if( error_msg ) { error_msg->formatstr_cat("The job proxy is invalid."); } return -1; } // make a copy of std FDs so we're not messing w/ our caller's // memory int std_fds[3] = {-1, -1, -1}; ArgList modified_args; modified_args.AppendArg(m_run_script); modified_args.AppendArg(m_glexec); modified_args.AppendArg(m_proxy); modified_args.AppendArg(m_sandbox); modified_args.AppendArg(m_wrapper_script); for (int i = 0; i < 3; i++) { modified_args.AppendArg((job_std_fds == NULL || job_std_fds[i] == -1) ? std_file_names[i] : "-"); } modified_args.AppendArg(path); for (int i = 1; i < args.Count(); i++) { modified_args.AppendArg(args.GetArg(i)); } int glexec_errors = 0; while(1) { // setup a UNIX domain socket for communicating with // condor_glexec_wrapper (see comment above feed_wrapper() // for details // int sock_fds[2]; if (socketpair(PF_UNIX, SOCK_STREAM, 0, sock_fds) == -1) { dprintf(D_ALWAYS, "GLEXEC: socketpair error: %s\n", strerror(errno)); return false; } std_fds[0] = sock_fds[1]; // now create a pipe for receiving diagnostic stdout/stderr from glexec int glexec_out_fds[2]; if (pipe(glexec_out_fds) < 0) { dprintf(D_ALWAYS, "GLEXEC: pipe() error: %s\n", strerror(errno)); close(sock_fds[0]); close(sock_fds[1]); return false; } std_fds[1] = glexec_out_fds[1]; std_fds[2] = std_fds[1]; // collect glexec stderr and stdout together FamilyInfo fi; FamilyInfo* fi_ptr = (family_info != NULL) ? family_info : &fi; MyString proxy_path; proxy_path.formatstr("%s.condor/%s", m_sandbox, m_proxy); fi_ptr->glexec_proxy = proxy_path.Value(); // At the very least, we need to pass the condor daemon's // X509_USER_PROXY to condor_glexec_run. Currently, we just // pass all daemon environment. We do _not_ run // condor_glexec_run in the job environment, because that // would be a security risk and would serve no purpose, since // glexec cleanses the environment anyway. dc_job_opts &= ~(DCJOBOPT_NO_ENV_INHERIT); int pid = daemonCore->Create_Process(m_run_script.Value(), modified_args, PRIV_USER_FINAL, reaper_id, FALSE, FALSE, NULL, iwd, fi_ptr, NULL, std_fds, NULL, nice_inc, NULL, dc_job_opts, core_size_ptr, NULL, NULL, error_msg); // close our handle to glexec's end of the diagnostic output pipe close(glexec_out_fds[1]); MyString glexec_error_msg; int glexec_rc = 0; int ret_val = feed_wrapper(pid, sock_fds, env, dc_job_opts, job_std_fds, glexec_out_fds[0], &glexec_error_msg, &glexec_rc); // if not closed in feed_wrapper, close the glexec error pipe now if( glexec_out_fds[0] != -1 ) { close(glexec_out_fds[0]); } // Unlike the other glexec operations where we handle // glexec retry inside the helper script, // condor_glexec_run cannot handle retry for us, because // it must exec glexec rather than spawning it off in a // new process. Therefore, we handle here retries in case // of transient errors. if( ret_val != 0 ) { return ret_val; // success } bool retry = true; if( glexec_rc != 202 && glexec_rc != 203 ) { // Either a non-transient glexec error, or some other // non-glexec error. retry = false; } else { // This _could_ be a transient glexec issue, such as a // communication error with GUMS, so retry up to some // limit. glexec_errors += 1; if( glexec_errors > m_glexec_retries ) { retry = false; } } if( !retry ) { // return the most recent glexec error output if( error_msg ) { error_msg->formatstr_cat(glexec_error_msg.Value()); } return 0; } // truncated exponential backoff int delay_rand = 1 + (get_random_int() % glexec_errors) % 100; int delay = m_glexec_retry_delay * delay_rand; dprintf(D_ALWAYS,"Glexec exited with status %d; retrying in %d seconds.\n", glexec_rc, delay ); sleep(delay); // now try again ... } // should never get here return 0; }
int GLExecPrivSepHelper::create_process(const char* path, ArgList& args, Env& env, const char* iwd, int job_std_fds[3], const char* std_file_names[3], int nice_inc, size_t* core_size_ptr, int reaper_id, int dc_job_opts, FamilyInfo* family_info, int *, MyString *error_msg) { ASSERT(m_initialized); if (!proxy_valid_right_now()) { dprintf(D_ALWAYS, "GLExecPrivSepHelper::create_process: not invoking glexec since the proxy is not valid!\n"); return -1; } // make a copy of std FDs so we're not messing w/ our caller's // memory int std_fds[3] = {-1, -1, -1}; ArgList modified_args; modified_args.AppendArg(m_run_script); modified_args.AppendArg(m_glexec); modified_args.AppendArg(m_proxy); modified_args.AppendArg(m_sandbox); modified_args.AppendArg(m_wrapper_script); for (int i = 0; i < 3; i++) { modified_args.AppendArg((job_std_fds == NULL || job_std_fds[i] == -1) ? std_file_names[i] : "-"); } modified_args.AppendArg(path); for (int i = 1; i < args.Count(); i++) { modified_args.AppendArg(args.GetArg(i)); } // setup a UNIX domain socket for communicating with // condor_glexec_wrapper (see comment above feed_wrapper() // for details // int sock_fds[2]; if (socketpair(PF_UNIX, SOCK_STREAM, 0, sock_fds) == -1) { dprintf(D_ALWAYS, "GLEXEC: socketpair error: %s\n", strerror(errno)); return false; } std_fds[0] = sock_fds[1]; // now create a pipe for receiving diagnostic stdout/stderr from glexec int glexec_out_fds[2]; if (pipe(glexec_out_fds) < 0) { dprintf(D_ALWAYS, "GLEXEC: pipe() error: %s\n", strerror(errno)); close(sock_fds[0]); close(sock_fds[1]); return false; } std_fds[1] = glexec_out_fds[1]; std_fds[2] = std_fds[1]; // collect glexec stderr and stdout together FamilyInfo fi; FamilyInfo* fi_ptr = (family_info != NULL) ? family_info : &fi; MyString proxy_path; proxy_path.formatstr("%s.condor/%s", m_sandbox, m_proxy); fi_ptr->glexec_proxy = proxy_path.Value(); // At the very least, we need to pass the condor daemon's // X509_USER_PROXY to condor_glexec_run. Currently, we just // pass all daemon environment. We do _not_ run // condor_glexec_run in the job environment, because that // would be a security risk and would serve no purpose, since // glexec cleanses the environment anyway. dc_job_opts &= ~(DCJOBOPT_NO_ENV_INHERIT); int pid = daemonCore->Create_Process(m_run_script.Value(), modified_args, PRIV_USER_FINAL, reaper_id, FALSE, NULL, iwd, fi_ptr, NULL, std_fds, NULL, nice_inc, NULL, dc_job_opts, core_size_ptr, NULL, NULL, error_msg); // close our handle to glexec's end of the diagnostic output pipe close(glexec_out_fds[1]); int ret_val = feed_wrapper(pid, sock_fds, env, dc_job_opts, job_std_fds, glexec_out_fds[0], error_msg); // if not closed in feed_wrapper, close the glexec error pipe now if( glexec_out_fds[0] != -1 ) { close(glexec_out_fds[0]); } return ret_val; }