bool XInterface::TryUser(const char *user) { static char env[1024]; static bool need_uninit = false; passwd *passwd_entry; passwd_entry = getpwnam(user); if(passwd_entry == NULL) { // We couldn't find the current user in the passwd file? dprintf( D_FULLDEBUG, "Current user cannot be found in passwd file.\n" ); return false; } else { sprintf(env, "XAUTHORITY=%s/.Xauthority", passwd_entry->pw_dir); if(putenv(env) != 0) { EXCEPT("Putenv failed!."); } } if ( need_uninit ) { uninit_user_ids(); need_uninit = false; } // passing "root" to init_user_ids is fatal if (strcmp(user, "root") == 0) { set_root_priv(); } else { init_user_ids( user, NULL ); set_user_priv(); need_uninit = true; } dprintf( D_FULLDEBUG, "Using %s's .Xauthority: \n", passwd_entry->pw_name ); return true; }
// Spawn the configured HOOK_JOB_CLEANUP for a routed job, feeding the
// destination ad to the hook on stdin.
// Returns: 0 if no hook is configured or there is no destination ad,
//          1 if the hook was spawned (or one is already outstanding),
//         -1 on failure to create or spawn the hook client.
int
JobRouterHookMgr::hookJobCleanup(RoutedJob* r_job)
{
	ClassAd temp_ad;
	char* hook_cleanup = getHookPath(HOOK_JOB_CLEANUP, r_job->src_ad);

	if (NULL == hook_cleanup)
	{
		// hook not defined
		dprintf(D_FULLDEBUG, "HOOK_JOB_CLEANUP not configured.\n");
		return 0;
	}

	if (0 >= r_job->dest_ad.size())
	{
		// No destination ad means nothing to hand the hook on stdin.
		return 0;
	}

	// Verify the cleanup hook hasn't already been spawned and that
	// we're not waiting for it to return.
	std::string key = r_job->dest_key;
	if (true == JobRouterHookMgr::checkHookKnown(key.c_str(), HOOK_JOB_CLEANUP))
	{
		dprintf(D_FULLDEBUG, "JobRouterHookMgr::hookJobCleanup "
			"retried while still waiting for cleanup hook to "
			"return for job with key %s - ignoring\n", key.c_str());
		return 1;
	}

	temp_ad = r_job->dest_ad;

	MyString hook_stdin;
	temp_ad.sPrint(hook_stdin);

	CleanupClient* cleanup_client = new CleanupClient(hook_cleanup, r_job);
	if (NULL == cleanup_client)
	{
		dprintf(D_ALWAYS|D_FAILURE,
			"ERROR in JobRouterHookMgr::hookJobCleanup: "
			"failed to create status update client\n");
		return -1;
	}

	set_user_from_ad(r_job->src_ad);
	if (0 == spawn(cleanup_client, NULL, &hook_stdin, PRIV_USER_FINAL))
	{
		// BUGFIX: restore the privilege state on the failure path too.
		// Previously uninit_user_ids() was only reached on success,
		// leaving the user ids from set_user_from_ad() initialized
		// after a spawn error.
		uninit_user_ids();
		dprintf(D_ALWAYS|D_FAILURE,
				"ERROR in JobRouterHookMgr::JobCleanup: "
				"failed to spawn HOOK_JOB_CLEANUP (%s)\n", hook_cleanup);
		delete cleanup_client;
		return -1;
	}
	uninit_user_ids();

	// Add our info to the list of hooks currently running for this job.
	if (false == JobRouterHookMgr::addKnownHook(key.c_str(), HOOK_JOB_CLEANUP))
	{
		dprintf(D_ALWAYS, "ERROR in JobRouterHookMgr::hookJobCleanup: "
			"failed to add HOOK_JOB_CLEANUP to list of "
			"hooks running for job key %s\n", key.c_str());
	}

	dprintf(D_FULLDEBUG, "HOOK_JOB_CLEANUP (%s) invoked.\n", hook_cleanup);
	return 1;
}
// Spawn the configured HOOK_JOB_EXIT for a routed job, feeding the
// source and destination ads (separated by "------") on stdin.
// Returns: 0 if no hook is configured,
//          1 if the hook was spawned (or one is already outstanding),
//         -1 on failure to create or spawn the hook client.
int
JobRouterHookMgr::hookJobExit(RoutedJob* r_job)
{
	ClassAd temp_ad;
	char* hook_job_exit = getHookPath(HOOK_JOB_EXIT, r_job->src_ad);

	if (NULL == hook_job_exit)
	{
		// hook not defined
		dprintf(D_FULLDEBUG, "HOOK_JOB_EXIT not configured.\n");
		return 0;
	}

	// Verify the exit hook hasn't already been spawned and that
	// we're not waiting for it to return.
	std::string key = r_job->dest_key;
	if (true == JobRouterHookMgr::checkHookKnown(key.c_str(),HOOK_JOB_EXIT))
	{
		dprintf(D_FULLDEBUG, "JobRouterHookMgr::hookJobExit "
			"retried while still waiting for exit hook to return "
			"for job with key %s - ignoring\n", key.c_str());
		return 1;
	}

	// Hook stdin: source ad, separator line, then destination ad.
	temp_ad = r_job->src_ad;
	MyString hook_stdin;
	temp_ad.sPrint(hook_stdin);
	hook_stdin += "\n------\n";

	temp_ad = r_job->dest_ad;
	temp_ad.sPrint(hook_stdin);

	ExitClient *exit_client = new ExitClient(hook_job_exit, r_job);
	if (NULL == exit_client)
	{
		dprintf(D_ALWAYS|D_FAILURE,
			"ERROR in JobRouterHookMgr::hookJobExit: "
			"failed to create exit client\n");
		return -1;
	}

	set_user_from_ad(r_job->src_ad);
	if (0 == spawn(exit_client, NULL, &hook_stdin, PRIV_USER_FINAL))
	{
		// BUGFIX: restore the privilege state on the failure path too.
		// Previously uninit_user_ids() was only reached on success,
		// leaving the user ids from set_user_from_ad() initialized
		// after a spawn error.
		uninit_user_ids();
		dprintf(D_ALWAYS|D_FAILURE,
				"ERROR in JobRouterHookMgr::hookJobExit: "
				"failed to spawn HOOK_JOB_EXIT (%s)\n", hook_job_exit);
		delete exit_client;
		return -1;
	}
	uninit_user_ids();

	// Add our info to the list of hooks currently running for this job.
	if (false == JobRouterHookMgr::addKnownHook(key.c_str(), HOOK_JOB_EXIT))
	{
		dprintf(D_ALWAYS, "ERROR in JobRouterHookMgr::hookJobExit: "
			"failed to add HOOK_JOB_EXIT to list of "
			"hooks running for job key %s\n", key.c_str());
	}

	dprintf(D_FULLDEBUG, "HOOK_JOB_EXIT (%s) invoked.\n", hook_job_exit);
	return 1;
}
int JobRouterHookMgr::hookTranslateJob(RoutedJob* r_job, std::string &route_info) { ClassAd temp_ad; char* hook_translate = getHookPath(HOOK_TRANSLATE_JOB, r_job->src_ad); if (NULL == hook_translate) { // hook not defined, which is ok dprintf(D_FULLDEBUG, "HOOK_TRANSLATE_JOB not configured.\n"); return 0; } // Verify the translate hook hasn't already been spawned and that // we're not waiting for it to return. std::string key = r_job->src_key; if (true == JobRouterHookMgr::checkHookKnown(key.c_str(), HOOK_TRANSLATE_JOB)) { dprintf(D_FULLDEBUG, "JobRouterHookMgr::hookTranslateJob " "retried while still waiting for translate hook to " "return for job with key %s - ignoring\n", key.c_str()); return 1; } temp_ad = r_job->src_ad; MyString hook_stdin; hook_stdin = route_info.c_str(); hook_stdin += "\n------\n"; temp_ad.sPrint(hook_stdin); TranslateClient* translate_client = new TranslateClient(hook_translate, r_job); if (NULL == translate_client) { dprintf(D_ALWAYS|D_FAILURE, "ERROR in JobRouterHookMgr::hookTranslateJob: " "failed to create translation client\n"); return -1; } set_user_from_ad(r_job->src_ad); if (0 == spawn(translate_client, NULL, &hook_stdin, PRIV_USER_FINAL)) { dprintf(D_ALWAYS|D_FAILURE, "ERROR in JobRouterHookMgr::hookTranslateJob: " "failed to spawn HOOK_TRANSLATE_JOB (%s)\n", hook_translate); delete translate_client; return -1; } uninit_user_ids(); // Add our info to the list of hooks currently running for this job. if (false == JobRouterHookMgr::addKnownHook(key.c_str(), HOOK_TRANSLATE_JOB)) { dprintf(D_ALWAYS, "ERROR in JobRouterHookMgr::hookTranslateJob: " "failed to add HOOK_TRANSLATE_JOB to list of " "hooks running for job key %s\n", key.c_str()); } dprintf(D_FULLDEBUG, "HOOK_TRANSLATE_JOB (%s) invoked.\n", hook_translate); return 1; }
// Find the gridmanager node serving this owner (and optional attribute
// match / cluster.proc), or start a new condor_gridmanager process for
// it.  Returns the hashtable node for the (possibly new) gridmanager,
// or NULL on failure (shutdown in progress, config error, bad owner,
// scratch-dir or process-creation failure).
GridUniverseLogic::gman_node_t *
GridUniverseLogic::StartOrFindGManager(const char* owner, const char* domain,
	const char* attr_value, const char* attr_name, int cluster, int proc)
{
	gman_node_t* gman_node;
	int pid;

	// If attr_value is an empty string, convert to NULL since code
	// after this point expects that.
	if ( attr_value && strlen(attr_value)==0 ) {
		attr_value = NULL;
		attr_name = NULL;
	}

	if ( (gman_node=lookupGmanByOwner(owner, attr_value, cluster, proc)) ) {
		// found it -- a gridmanager is already serving this owner
		return gman_node;
	}

	// not found. fire one up! we want to run the GManager as the user.
	// but first, make certain we are not shutting down...
	if (!gman_pid_table) {
		// destructor has already been called; we are probably
		// closing down.
		return NULL;
	}

#ifndef WIN32
	// Refuse to run a gridmanager as root on Unix.
	if (owner && strcasecmp(owner, "root") == 0 ) {
		dprintf(D_ALWAYS, "Tried to start condor_gmanager as root.\n");
		return NULL;
	}
#endif

	dprintf( D_FULLDEBUG, "Starting condor_gmanager for owner %s (%d.%d)\n",
			owner, cluster, proc);

	char *gman_binary;
	gman_binary = param("GRIDMANAGER");
	if ( !gman_binary ) {
		dprintf(D_ALWAYS,"ERROR - GRIDMANAGER not defined in config file\n");
		return NULL;
	}

	// Build the gridmanager command line: fixed args first, then any
	// admin-configured extras from GRIDMANAGER_ARGS.
	ArgList args;
	MyString error_msg;
	args.AppendArg("condor_gridmanager");
	args.AppendArg("-f");

	char *gman_args = param("GRIDMANAGER_ARGS");

	if(!args.AppendArgsV1RawOrV2Quoted(gman_args,&error_msg)) {
		dprintf( D_ALWAYS, "ERROR: failed to parse gridmanager args: %s\n",
				error_msg.Value());
		free(gman_binary);
		free(gman_args);
		return NULL;
	}
	free(gman_args);

	// build a constraint
	if ( !owner ) {
		dprintf(D_ALWAYS,"ERROR - missing owner field\n");
		free(gman_binary);
		return NULL;
	}
	MyString constraint;
	if ( !attr_name ) {
		// Match all grid-universe jobs belonging to this owner.
		constraint.formatstr("(%s=?=\"%s\"&&%s==%d)",
				ATTR_OWNER,owner,
				ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);
	} else {
		// Additionally require the given attribute to match.
		constraint.formatstr("(%s=?=\"%s\"&&%s=?=\"%s\"&&%s==%d)",
				ATTR_OWNER,owner,
				attr_name,attr_value,
				ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);
		args.AppendArg("-A");
		args.AppendArg(attr_value);
	}
	args.AppendArg("-C");
	args.AppendArg(constraint.Value());

	// Tell the gridmanager who it is running on behalf of
	// ("owner" or "owner@domain").
	MyString full_owner_name(owner);
	if ( domain && *domain ) {
		full_owner_name.formatstr_cat( "@%s", domain );
	}
	args.AppendArg("-o");
	args.AppendArg(full_owner_name.Value());

	if (!init_user_ids(owner, domain)) {
		dprintf(D_ALWAYS,"ERROR - init_user_ids() failed in GRIDMANAGER\n");
		free(gman_binary);
		return NULL;
	}

	static bool first_time_through = true;
	if ( first_time_through ) {
		// Note: Because first_time_through is static, this block runs only
		// once per schedd invocation.
		first_time_through = false;

		// Clean up any old / abandoned scratch dirs.
		dprintf(D_FULLDEBUG,"Checking for old gridmanager scratch dirs\n");
		char *prefix = temp_dir_path();
		ASSERT(prefix);
		Directory tmp( prefix, PRIV_USER );
		const char *f;
		char const *dot;
		int fname_pid;
		int mypid = daemonCore->getpid();
		int scratch_pre_len = strlen(scratch_prefix);
		while ( (f=tmp.Next()) ) {
			// skip regular files -- we only need to inspect subdirs
			if ( !tmp.IsDirectory() ) {
				continue;
			}
			// skip if it does not start with our prefix
			if ( strncmp(scratch_prefix,f,scratch_pre_len) ) {
				continue;
			}
			// skip if does not end w/ a pid
			dot = strrchr(f,'.');
			if ( !dot ) {
				continue;
			}
			// skip if this pid is still alive and not ours
			dot++;	// skip over period
			fname_pid = atoi(dot);
			if ( fname_pid != mypid && daemonCore->Is_Pid_Alive(fname_pid) ) {
				continue;
			}
			// if we made it here, blow away this subdir
			if ( tmp.Remove_Current_File() ) {
				dprintf(D_ALWAYS,"Removed old scratch dir %s\n",
						tmp.GetFullPath());
			}
		}	// end of while for cleanup of old scratch dirs
		dprintf(D_FULLDEBUG,"Done checking for old scratch dirs\n");

		if (prefix != NULL) {
			free(prefix);
			prefix = NULL;
		}
	}	// end of once-per-schedd invocation block

	// Create a temp dir for the gridmanager and append proper
	// command-line arguments to tell where it is.
	bool failed = false;
	gman_node = new gman_node_t;
	char *finalpath = scratchFilePath(gman_node);
	// Create the scratch dir as the user, then restore our previous
	// privilege state before continuing.
	priv_state saved_priv = set_user_priv();
	if ( (mkdir(finalpath,0700)) < 0 ) {
		// mkdir failed.
		dprintf(D_ALWAYS,"ERROR - mkdir(%s,0700) failed in GRIDMANAGER, errno=%d (%s)\n",
				finalpath, errno, strerror(errno));
		failed = true;
	}
	set_priv(saved_priv);
	uninit_user_ids();
	args.AppendArg("-S");	// -S = "ScratchDir" argument
	args.AppendArg(finalpath);
	delete [] finalpath;
	if ( failed ) {
		// we already did dprintf reason to the log...
		free(gman_binary);
		delete gman_node;
		return NULL;
	}

	if(IsFulldebug(D_FULLDEBUG)) {
		MyString args_string;
		args.GetArgsStringForDisplay(&args_string);
		dprintf(D_FULLDEBUG,"Really Execing %s\n",args_string.Value());
	}

	pid = daemonCore->Create_Process(
			gman_binary,	// Program to exec
			args,			// Command-line args
			PRIV_ROOT,		// Run as root, so it can switch to
							// PRIV_CONDOR
			rid				// Reaper ID
			);

	free(gman_binary);

	if ( pid <= 0 ) {
		dprintf ( D_ALWAYS, "StartOrFindGManager: Create_Process problems!\n" );
		if (gman_node) delete gman_node;
		return NULL;
	}

	// If we made it here, we happily started up a new gridmanager process
	dprintf( D_ALWAYS,
			"Started condor_gmanager for owner %s pid=%d\n",
			owner,pid);

	// Make a new gman_node entry for our hashtable & insert it
	// NOTE(review): gman_node was already allocated above for
	// scratchFilePath(), so this branch appears to be dead code --
	// TODO confirm.
	if ( !gman_node ) {
		gman_node = new gman_node_t;
	}
	gman_node->pid = pid;
	gman_node->owner[0] = '\0';
	gman_node->domain[0] = '\0';
	// NOTE(review): unchecked strcpy into fixed-size owner/domain
	// buffers -- presumably lengths are bounded by the callers; verify.
	if ( owner ) {
		strcpy(gman_node->owner,owner);
	}
	if ( domain ) {
		strcpy(gman_node->domain,domain);
	}
	// Key must mirror the one used by lookupGmanByOwner().
	MyString owner_key(owner);
	if(attr_value){
		owner_key += attr_value;
	}
	if (cluster) {
		owner_key.formatstr_cat( "-%d.%d", cluster, proc );
	}

	ASSERT( gman_pid_table->insert(owner_key,gman_node) == 0 );

	// start timer to signal gridmanager if we haven't already
	if ( gman_node->add_timer_id == -1 ) {  // == -1 means no timer set
		gman_node->add_timer_id = daemonCore->Register_Timer(job_added_delay,
				GridUniverseLogic::SendAddSignal,
				"GridUniverseLogic::SendAddSignal");
		daemonCore->Register_DataPtr(gman_node);
	}

	// All done
	return gman_node;
}
// Reaper callback invoked by DaemonCore when a condor_gridmanager
// child exits.  Logs the exit reason, emails the admin (rate-limited)
// if the gridmanager died because it could not write its own log,
// then removes the node from the pid table, deletes the gridmanager's
// scratch directory as the user, and frees the node.  Always returns 0.
int GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.
	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

	// Log the exit even if we could not match the pid to a known node.
	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) { owner_safe = owner; }
	else { owner_safe = "Unknown"; }
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
				WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
				daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );

	// Special case: the gridmanager exited because it could not write
	// its own log file (DPRINTF_ERROR exit code).
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS, "The gridmanager had a problem writing its log. "
				"Check the permissions of the file specified by GRIDMANAGER_LOG; "
				"it needs to be writable by Condor.\n");

		/* send email to the admin about this, but only
		 * every six hours - enough to not be ignored, but
		 * not enough to be a pest. If only my children were
		 * so helpful and polite. Ah, well, we can always dream...
		 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
					"The condor_gridmanager had an error writing its log file. Check the \n"
					"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
					"the condor_config file. "
					"This file needs to be writable as user %s to enable\n"
					"the condor_gridmanager daemon to write to it. \n\n"
					"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
					"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
				// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}
	// Remove node from our hash table ("owner" still holds the key
	// found during iteration above).
	gman_pid_table->remove(owner);
	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
	if ( IsDirectory(scratchdir) &&
		 init_user_ids(gman_node->owner, gman_node->domain) )
	{
		priv_state saved_priv = set_user_priv();
		// Must put this in braces so the Directory object
		// destructor is called, which will free the iterator
		// handle. If we didn't do this, the below rmdir
		// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}
// Spawn the configured HOOK_UPDATE_JOB_INFO for a routed job, feeding
// the destination ad to the hook on stdin.
// Returns: 0 if no hook is configured,
//          1 if the hook was spawned (or one is already outstanding),
//         -1 on failure to create or spawn the hook client.
int
JobRouterHookMgr::hookUpdateJobInfo(RoutedJob* r_job)
{
	ClassAd temp_ad;
	char* hook_update_job_info = getHookPath(HOOK_UPDATE_JOB_INFO, r_job->src_ad);

	if (NULL == hook_update_job_info)
	{
		// hook not defined
		dprintf(D_FULLDEBUG, "HOOK_UPDATE_JOB_INFO not configured.\n");
		return 0;
	}

	// Verify the status hook hasn't already been spawned and that
	// we're not waiting for it to return.
	std::string key = r_job->dest_key;
	if (true == JobRouterHookMgr::checkHookKnown(key.c_str(), HOOK_UPDATE_JOB_INFO))
	{
		dprintf(D_FULLDEBUG, "JobRouterHookMgr::hookUpdateJobInfo "
			"retried while still waiting for status hook to return "
			"for job with key %s - ignoring\n", key.c_str());
		return 1;
	}

	temp_ad = r_job->dest_ad;

	MyString hook_stdin;
	sPrintAd(hook_stdin, temp_ad);

	StatusClient* status_client = new StatusClient(hook_update_job_info, r_job);
	if (NULL == status_client)
	{
		dprintf(D_ALWAYS|D_FAILURE,
			"ERROR in JobRouterHookMgr::hookUpdateJobInfo: "
			"failed to create status update client\n");
		return -1;
	}

	set_user_priv_from_ad(r_job->src_ad);
	if (0 == spawn(status_client, NULL, &hook_stdin, PRIV_USER_FINAL))
	{
		// BUGFIX: restore the privilege state on the failure path too.
		// Previously uninit_user_ids() was only reached on success,
		// leaving the user ids from set_user_priv_from_ad()
		// initialized after a spawn error.
		uninit_user_ids();
		dprintf(D_ALWAYS|D_FAILURE,
				"ERROR in JobRouterHookMgr::hookUpdateJobInfo: "
				"failed to spawn HOOK_UPDATE_JOB_INFO (%s)\n", hook_update_job_info);
		delete status_client;
		return -1;
	}
	uninit_user_ids();

	// Add our info to the list of hooks currently running for this job.
	if (false == JobRouterHookMgr::addKnownHook(key.c_str(), HOOK_UPDATE_JOB_INFO))
	{
		dprintf(D_ALWAYS, "ERROR in JobRouterHookMgr::hookUpdateJobInfo: "
			"failed to add HOOK_UPDATE_JOB_INFO to list of "
			"hooks running for job key %s\n", key.c_str());
	}

	dprintf(D_FULLDEBUG, "HOOK_UPDATE_JOB_INFO (%s) invoked.\n",
			hook_update_job_info);
	return 1;
}