int init_user_id_from_FQN (const char * _fqn) { char * uid = NULL; char * domain = NULL; char * fqn = NULL; char default_uid [] = "nobody"; if (_fqn) { fqn = strdup (_fqn); uid = fqn; // Domain? char * pAt = strchr (fqn, '@'); if (pAt) { *pAt='\0'; domain = pAt+1; } } if (uid == NULL) { uid = default_uid; } int rc = init_user_ids (uid, domain); dprintf (D_FULLDEBUG, "Switching to user %s@%s, result = %d\n", uid, domain, rc); if (fqn) free (fqn); return rc; }
void main_pre_dc_init( int argc, char* argv[] ) { // handle -o, so that we can switch euid to the user before // daemoncore does most of its initialization work. int i = 1; while ( i < argc ) { if ( !strcmp( argv[i], "-o" ) ) { // Say what user we're running jobs on behave of. // If the schedd starts us as root, we need to switch to // this uid for most of our life. if ( argc <= i + 1 ) { usage( argv[0] ); } myUserName = strdup( argv[i + 1] ); break; } i++; } if ( myUserName ) { char *owner = strdup( myUserName ); char *domain = strchr( owner, '@' ); if ( domain ) { *domain = '\0'; domain = domain + 1; } if ( !init_user_ids(owner, domain)) { dprintf(D_ALWAYS, "init_user_ids() failed!\n"); // uids.C will EXCEPT when we set_user_priv() now // so there's not much we can do at this point } set_user_priv(); // We can't call daemonCore->Register_Priv_State() here because // there's no daemonCore object yet. We'll call it in main_init(). free( myUserName ); myUserName = owner; } else if ( is_root() ) { dprintf( D_ALWAYS, "Don't know what user to run as!\n" ); DC_Exit( 1 ); } else { myUserName = my_username(); } }
void initialize_uids(void) { #if defined(WIN32) #include "my_username.h" char *name = NULL; char *domain = NULL; name = my_username(); domain = my_domainname(); caller_name = name; job_user_name = name; if ( !init_user_ids(name, domain ) ) { // shouldn't happen - we always can get our own token vmprintf(D_ALWAYS, "Could not initialize user_priv with our own token!\n"); } vmprintf(D_ALWAYS, "Initialize Uids: caller=%s@%s, job user=%s@%s\n", caller_name.Value(), domain, job_user_name.Value(), domain); if( name ) { free(name); } if( domain ) { free(domain); } return; #else // init_user_ids was called in main_pre_dc_init() vmprintf(D_ALWAYS, "Initial UID/GUID=%d/%d, EUID/EGUID=%d/%d, " "Condor UID/GID=%d,%d\n", (int)getuid(), (int)getuid(), (int)geteuid(), (int)getegid(), (int)get_condor_uid(), (int)get_condor_gid()); vmprintf(D_ALWAYS, "Initialize Uids: caller=%s, job user=%s\n", caller_name.Value(), job_user_name.Value()); return; #endif }
bool XInterface::TryUser(const char *user) { static char env[1024]; static bool need_uninit = false; passwd *passwd_entry; passwd_entry = getpwnam(user); if(passwd_entry == NULL) { // We couldn't find the current user in the passwd file? dprintf( D_FULLDEBUG, "Current user cannot be found in passwd file.\n" ); return false; } else { sprintf(env, "XAUTHORITY=%s/.Xauthority", passwd_entry->pw_dir); if(putenv(env) != 0) { EXCEPT("Putenv failed!."); } } if ( need_uninit ) { uninit_user_ids(); need_uninit = false; } // passing "root" to init_user_ids is fatal if (strcmp(user, "root") == 0) { set_root_priv(); } else { init_user_ids( user, NULL ); set_user_priv(); need_uninit = true; } dprintf( D_FULLDEBUG, "Using %s's .Xauthority: \n", passwd_entry->pw_name ); return true; }
/*
 * Common initialization for all shadow flavors.
 *
 * Pulls required attributes out of the job ad (EXCEPTs if any are
 * missing), initializes user_priv for the job owner (optionally
 * falling back to "nobody" on UNIX), sets up the schedd updater and
 * user log, chdirs to the job's iwd, and may claim a startd before
 * activation.  Note: may not return if the job is already pending
 * termination (terminateJob() does not return).
 *
 * job_ad                  - the job ClassAd; must not be NULL.
 * schedd_addr             - sinful string of the schedd to update.
 * xfer_queue_contact_info - contact info for the transfer queue.
 */
void BaseShadow::baseInit( ClassAd *job_ad, const char* schedd_addr, const char *xfer_queue_contact_info )
{
	int pending = FALSE;

	if( ! job_ad ) {
		EXCEPT("baseInit() called with NULL job_ad!");
	}
	jobAd = job_ad;

	if (sendUpdatesToSchedd && ! is_valid_sinful(schedd_addr)) {
		EXCEPT("schedd_addr not specified with valid address");
	}
	// When not updating the schedd, store a placeholder so scheddAddr
	// is always a valid heap string.
	scheddAddr = sendUpdatesToSchedd ? strdup( schedd_addr ) : strdup("noschedd");

	m_xfer_queue_contact_info = xfer_queue_contact_info;

	// Required attributes: owner, cluster, proc, iwd.  Missing any of
	// these is fatal.
	if ( !jobAd->LookupString(ATTR_OWNER, owner)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_OWNER);
	}

	if( !jobAd->LookupInteger(ATTR_CLUSTER_ID, cluster)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_CLUSTER_ID);
	}

	if( !jobAd->LookupInteger(ATTR_PROC_ID, proc)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_PROC_ID);
	}

	// Grab the GlobalJobId if we've got it.
	if( ! jobAd->LookupString(ATTR_GLOBAL_JOB_ID, &gjid) ) {
		gjid = NULL;
	}

	// grab the NT domain if we've got it
	jobAd->LookupString(ATTR_NT_DOMAIN, domain);

	if ( !jobAd->LookupString(ATTR_JOB_IWD, iwd)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_JOB_IWD);
	}

	// Byte counts carried over from any previous run default to 0.
	if( !jobAd->LookupFloat(ATTR_BYTES_SENT, prev_run_bytes_sent) ) {
		prev_run_bytes_sent = 0;
	}
	if( !jobAd->LookupFloat(ATTR_BYTES_RECVD, prev_run_bytes_recvd) ) {
		prev_run_bytes_recvd = 0;
	}

	// construct the core file name we'd get if we had one.
	MyString tmp_name = iwd;
	tmp_name += DIR_DELIM_CHAR;
	tmp_name += "core.";
	tmp_name += cluster;
	tmp_name += '.';
	tmp_name += proc;
	core_file_name = strdup( tmp_name.Value() );

	// put the shadow's sinful string into the jobAd.  Helpful for
	// the mpi shadow, at least...and a good idea in general.
	MyString tmp_addr = ATTR_MY_ADDRESS;
	tmp_addr += "=\"";
	tmp_addr += daemonCore->InfoCommandSinfulString();
	tmp_addr += '"';
	if ( !jobAd->Insert( tmp_addr.Value() )) {
		EXCEPT( "Failed to insert %s!", ATTR_MY_ADDRESS );
	}

	DebugId = display_dprintf_header;

	config();

	// Make sure we've got enough swap space to run
	checkSwap();

	// handle system calls with Owner's privilege
	// XXX this belong here?  We'll see...
	// Calling init_user_ids() while in user priv causes badness.
	// Make sure we're in another priv state.
	set_condor_priv();
	if ( !init_user_ids(owner.Value(), domain.Value())) {
		dprintf(D_ALWAYS, "init_user_ids() failed as user %s\n",owner.Value() );
		// uids.C will EXCEPT when we set_user_priv() now
		// so there's not much we can do at this point
#if ! defined(WIN32)
		// On UNIX, optionally fall back to running the job as
		// "nobody" when the owner's account is unavailable.
		if ( param_boolean( "SHADOW_RUN_UNKNOWN_USER_JOBS", false ) ) {
			dprintf(D_ALWAYS, "trying init_user_ids() as user nobody\n" );
			owner="nobody";
			domain=NULL;
			if (!init_user_ids(owner.Value(), domain.Value())) {
				dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			} else {
				// Record in the job ad that we're not running as
				// the real owner.
				jobAd->Assign( ATTR_JOB_RUNAS_OWNER, "FALSE" );
				m_RunAsNobody=true;
				dprintf(D_ALWAYS, "init_user_ids() now running as user nobody\n");
			}
		}
#endif
	}
	set_user_priv();
	daemonCore->Register_Priv_State( PRIV_USER );

	dumpClassad( "BaseShadow::baseInit()", this->jobAd, D_JOB );

	// initialize the UserPolicy object
	shadow_user_policy.init( jobAd, this );

	// setup an object to keep our job ad updated to the schedd's
	// permanent job queue.  this clears all the dirty bits on our
	// copy of the classad, so anything we touch after this will
	// be updated to the schedd when appropriate.
	// Unless we got a command line arg asking us not to
	if (sendUpdatesToSchedd) {
		// the usual case
		job_updater = new QmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	} else {
		job_updater = new NullQmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	}

	// init user log; hold on failure
	// NOTE: job_updater must be initialized _before_ initUserLog(),
	// in order to handle the case of the job going on hold as a
	// result of failure in initUserLog().
	initUserLog();

	// change directory; hold on failure
	if ( cdToIwd() == -1 ) {
		EXCEPT("Could not cd to initial working directory");
	}

	// check to see if this invocation of the shadow is just to write
	// a terminate event and exit since this job had been recorded as
	// pending termination, but somehow the job didn't leave the queue
	// and the schedd is trying to restart it again..
	if( jobAd->LookupInteger(ATTR_TERMINATION_PENDING, pending)) {
		if (pending == TRUE) {
			// If the classad of this job "thinks" that this job should be
			// finished already, let's enact that belief.
			// This function does not return.
			this->terminateJob(US_TERMINATE_PENDING);
		}
	}

	// If we need to claim the startd before activating the claim
	int wantClaiming = 0;
	jobAd->LookupBool(ATTR_CLAIM_STARTD, wantClaiming);
	if (wantClaiming) {
		MyString startdSinful;
		MyString claimid;

		// Pull startd addr and claimid out of the jobad
		jobAd->LookupString(ATTR_STARTD_IP_ADDR, startdSinful);
		jobAd->LookupString(ATTR_CLAIM_ID, claimid);

		dprintf(D_ALWAYS, "%s is true, trying to claim startd %s\n",
		        ATTR_CLAIM_STARTD, startdSinful.Value());

		classy_counted_ptr<DCStartd> startd = new DCStartd("description", NULL,
		        startdSinful.Value(), claimid.Value());

		classy_counted_ptr<DCMsgCallback> cb = new DCMsgCallback(
		        (DCMsgCallback::CppFunction)&BaseShadow::startdClaimedCB,
		        this, jobAd);

		// this can't fail, will always call the callback
		startd->asyncRequestOpportunisticClaim(jobAd, "description",
		        daemonCore->InfoCommandSinfulString(),
		        1200 /*alive interval*/, 20 /* net timeout*/,
		        100 /*total timeout*/, cb);
	}
}
/*
 * Initialize user_priv for this job based on ATTR_OWNER (and, on
 * Windows, ATTR_NT_DOMAIN) from the job ad.  On Windows, non-local
 * universe jobs are handed off to initUserPrivWindows() instead.
 *
 * Returns true on success; also records success in
 * user_priv_is_initialized.
 */
bool
JICLocal::initUserPriv( void )
{
	bool rval = false;
#ifdef WIN32
	/*
	  If we're on windoze, and this is anything but a local universe
	  job, we should immediately try the windoze-specific method for
	  per-slot users, setting up a nobody account, etc, and be done
	  with it.  However, if it's local universe, we basically want to
	  do what we do for the unix case: find ATTR_OWNER (and
	  ATTR_NT_DOMAIN) from the job ad and initialize ourselves with
	  that...
	*/
	if( job_universe != CONDOR_UNIVERSE_LOCAL ) {
		return initUserPrivWindows();
	}
#endif

	// Before we go through any trouble, see if we even need
	// ATTR_OWNER to initialize user_priv.  If not, go ahead and
	// initialize it as appropriate.
	if( initUserPrivNoOwner() ) {
		return true;
	}

	char* owner = NULL;
	if( job_ad->LookupString( ATTR_OWNER, &owner ) != 1 ) {
		dprintf( D_ALWAYS, "ERROR: %s not found in JobAd. Aborting.\n", ATTR_OWNER );
		return false;
	}

#ifdef WIN32
	// we only care about or expect to find this attribute if
	// we're on windoze...
	char* domain = NULL;
	if( job_ad->LookupString( ATTR_NT_DOMAIN, &domain ) != 1 ) {
		// NOTE(review): this early return leaks the heap-allocated
		// "owner" string from the lookup above -- confirm and fix.
		dprintf( D_ALWAYS, "ERROR: %s not found in JobAd. Aborting.\n", ATTR_NT_DOMAIN );
		return false;
	}
	if( ! init_user_ids(owner,domain) ) {
		dprintf( D_ALWAYS, "ERROR: Bad or missing credential for user \"%s@%s\"\n", owner, domain );
	} else {
		rval = true;
		dprintf( D_FULLDEBUG, "Initialized user_priv as \"%s@%s\"\n", owner, domain );
	}
	if( domain ) {
		free( domain );
		domain = NULL;
	}
#else /* UNIX */
	if( ! init_user_ids_quiet(owner) ) {
		dprintf( D_ALWAYS, "ERROR: Uid for \"%s\" not found in " "passwd database for a local job\n", owner );
	} else {
		// Tell the privsep helper (if one is configured) which user
		// the job will run as.
		CondorPrivSepHelper* psh = Starter->condorPrivSepHelper();
		if (psh != NULL) {
			psh->initialize_user(owner);
		}
		rval = true;
		dprintf( D_FULLDEBUG, "Initialized user_priv as \"%s\"\n", owner );
	}
#endif

	// deallocate owner string so we don't leak memory.
	free( owner );
	owner = NULL;

	if( rval ) {
		user_priv_is_initialized = true;
	}
	return rval;
}
/*
 * Main service loop for the shadow: switch to the job owner's uid,
 * chdir to the job's iwd, then field remote syscall requests
 * (RSC_SOCK) and user-log traffic (CLIENT_LOG) via select() until
 * the job exits.  Every periodic_interval_len seconds, periodic job
 * policy is evaluated and may break out of the loop.  On exit,
 * flushes any remaining log data, restores condor priv, and tells
 * the startd to quit.
 */
void
HandleSyscalls()
{
	register int	cnt;
	fd_set 	readfds;
	int nfds = -1;

	time_t periodic_interval_len = 20; /* secs, empirically found :) */

	// select() wants the highest descriptor number plus one.
	nfds = (RSC_SOCK > CLIENT_LOG ) ? (RSC_SOCK + 1) : (CLIENT_LOG + 1);

	// Field the job's system calls with the owner's privileges.
	init_user_ids(Proc->owner, NULL);
	set_user_priv();

	dprintf(D_FULLDEBUG, "HandleSyscalls: about to chdir(%s)\n", Proc->iwd);
	if( chdir(Proc->iwd) < 0 ) {
		sprintf( ErrBuf, "Can't chdir() to \"%s\"! [%s(%d)]", Proc->iwd,
				 strerror(errno), errno );
		HadErr = TRUE;
		return;
	}

	dprintf(D_SYSCALLS, "Shadow: Starting to field syscall requests\n");
	errno = 0;

	time_t current_time = time(0);
	time_t next_periodic_update = current_time + periodic_interval_len;

	for(;;) {	/* get a request and fulfill it */

		FD_ZERO(&readfds);
		FD_SET(RSC_SOCK, &readfds);
		FD_SET(CLIENT_LOG, &readfds);

		// Time out no later than the next periodic-policy deadline.
		struct timeval *ptimer = NULL, timer;
		timer.tv_sec = next_periodic_update - current_time;
		timer.tv_usec = 0;
		ptimer = &timer;

		/* if the current timer is set for a time longer than this,
			than truncate the timer required to the periodic limit.
			After inspection of the bandwidth timer, it seems that it
			will recorrect itself if select comes out of the loop
			before the timer goes off anyway to handle syscalls */
		if ( timer.tv_sec > periodic_interval_len) {
			timer.tv_sec = next_periodic_update - current_time;
			ptimer = &timer;
		}

		// Only allow child/USR1 signals to interrupt us while we are
		// blocked in select(); they stay blocked during handling below.
		unblock_signal(SIGCHLD);
		unblock_signal(SIGUSR1);
#if defined(LINUX) || defined(Solaris)
		cnt = select(nfds, &readfds, (fd_set *)0, (fd_set *)0, ptimer);
#else
		cnt = select(nfds, &readfds, 0, 0, ptimer);
#endif
		block_signal(SIGCHLD);
		block_signal(SIGUSR1);

		if( cnt < 0 && errno != EINTR ) {
			EXCEPT("HandleSyscalls: select: errno=%d, rsc_sock=%d, client_log=%d",errno,RSC_SOCK,CLIENT_LOG);
		}

		// Interrupted by a signal: just retry the select.
		if( cnt < 0 && errno == EINTR ) {
			continue;
		}

		if( FD_ISSET(CLIENT_LOG, &readfds) ) {
			if( HandleLog() < 0 ) {
				EXCEPT( "Peer went away" );
			}
		}

		if( FD_ISSET(RSC_SOCK, &readfds) ) {
			// A negative return means the job has finished its syscalls.
			if( do_REMOTE_syscall() < 0 ) {
				dprintf(D_SYSCALLS,
					"Shadow: do_REMOTE_syscall returned < 0\n");
				break;
			}
		}

		// NOTE(review): UMBILICAL is never added to readfds with
		// FD_SET in this loop (and is not counted in nfds), so this
		// branch appears unreachable -- confirm intent.
		if( FD_ISSET(UMBILICAL, &readfds) ) {
			dprintf(D_ALWAYS,
				"Shadow: Local scheduler apparently died, so I die too\n");
			exit(1);
		}

		current_time = time(0);

		/* if this is true, then do the periodic_interval_len events */
		if (current_time >= next_periodic_update) {
			next_periodic_update = current_time + periodic_interval_len;

			/* evaluate some attributes for policies like determining
				what to do if a job suspends wierdly or some such
				thing. This function has the possibility of making the
				shadow exit with JOB_SHOULD_HOLD or futzing up some
				global variables about how the job could've exited and
				letting Wraup take care of it. */
			if (periodic_policy() == true) {
				break;
			}
		}

#if defined(SYSCALL_DEBUG)
		strcpy( SyscallLabel, "shadow" );
#endif
	}

	/* The user job might exit while there is still unread data in the log.
	   So, select with a timeout of zero, and flush everything from the
	   log. */
	/* NOTE: Since HandleLog does it's own loop to make sure it's
	   read everything, we don't need a loop here, and should only
	   call HandleLog once.  In fact, if there's a problem w/
	   select(), a loop here can cause an infinite loop.
	   -Derek Wright and Jim Basney, 2/17/99. */
	HandleLog();

	/* Take back normal condor privileges */
	set_condor_priv();

	/* If we are debugging with named pipes as our communications medium,
	   won't have a condor_startd running - don't try to send to it. */
	if( !UsePipes ) {
		send_quit( ExecutingHost, GlobalCap );
	}

	dprintf(D_ALWAYS,
		"Shadow: Job %d.%d exited, termsig = %d, coredump = %d, retcode = %d\n",
			Proc->id.cluster, Proc->id.proc, WTERMSIG(JobStatus),
			WCOREDUMP(JobStatus), WEXITSTATUS(JobStatus));
}
/*
 * Win32: create/select the account the job will run as and
 * initialize user_priv for it.  Account selection order:
 *   1. VM-universe VMware jobs run as ourselves (vmrun needs admin),
 *   2. the job owner (if run-as-owner is allowed),
 *   3. a per-slot <SLOT>_USER configured account,
 *   4. ourselves when we can't switch ids,
 *   5. a dynamically-created "nobody" account.
 * Returns true if user_priv was initialized; also records the result
 * in user_priv_is_initialized.
 */
bool JobInfoCommunicator::initUserPrivWindows( void )
{
	// Win32
	// taken origionally from OsProc::StartJob. Here we create the
	// user and initialize user_priv.

	// By default, assume execute login may be shared by other processes.
	setExecuteAccountIsDedicated( NULL );

	// we support running the job as other users if the user
	// is specifed in the config file, and the account's password
	// is properly stored in our credential stash.

	char *name = NULL;
	char *domain = NULL;
	bool init_priv_succeeded = true;
	bool run_as_owner = allowRunAsOwner( false, false );

	// TODO..
	// Currently vmgahp for VMware VM universe can't run as user on Windows.
	// It seems like a bug of VMware. VMware command line tool such as "vmrun"
	// requires Administrator privilege.
	// So here we set name and domain with my_username and my_domainname
	// -jaeyoung 06/15/07
	if( job_universe == CONDOR_UNIVERSE_VM ) {
#if 0
		// If "VM_UNIV_NOBODY_USER" is defined in Condor configuration file,
		// wee will use it.
		char *vm_jobs_as = param("VM_UNIV_NOBODY_USER");
		if (vm_jobs_as) {
			getDomainAndName(vm_jobs_as, domain, name);
			/*
			 * name and domain are now just pointers into vm_jobs_as
			 * buffer.  copy these values into their own buffer so we
			 * deallocate below.
			 */
			if ( name ) {
				name = strdup(name);
			}
			if ( domain ) {
				domain = strdup(domain);
			}
			free(vm_jobs_as);
		}
#endif
		MyString vm_type;
		job_ad->LookupString(ATTR_JOB_VM_TYPE, vm_type);

		if( strcasecmp(vm_type.Value(), CONDOR_VM_UNIVERSE_VMWARE) == MATCH ) {
			name = my_username();
			domain = my_domainname();
		}
	}

	// Case 2: run as the job's owner, if allowed.
	if( !name ) {
		if ( run_as_owner ) {
			job_ad->LookupString(ATTR_OWNER,&name);
			job_ad->LookupString(ATTR_NT_DOMAIN,&domain);
		}
	}

	// Case 3: a per-slot configured account, e.g. SLOT1_USER.
	if ( !name ) {
		char slot_user[255];
		MyString slotName = "";
		slotName = Starter->getMySlotName();
		slotName.upper_case();
		// NOTE(review): this passes a MyString object to %s, which is
		// undefined behavior -- should presumably be slotName.Value();
		// confirm before relying on this path.
		sprintf(slot_user, "%s_USER", slotName);
		char *run_jobs_as = param(slot_user);
		if (run_jobs_as) {
			getDomainAndName(run_jobs_as, domain, name);
			/*
			 * name and domain are now just pointers into run_jobs_as
			 * buffer.  copy these values into their own buffer so we
			 * deallocate below.
			 */
			if ( name ) {
				name = strdup(name);
			}
			if ( domain ) {
				domain = strdup(domain);
			}
			free(run_jobs_as);
		}
	}

	if ( name ) {
		// We have a specific account: its password must be in the
		// credential stash for init_user_ids to succeed.
		if (!init_user_ids(name, domain)) {
			dprintf(D_ALWAYS, "Could not initialize user_priv as \"%s\\%s\".\n"
				"\tMake sure this account's password is securely stored "
				"with condor_store_cred.\n", domain, name );
			init_priv_succeeded = false;

		} else {
			MyString login_name;
			joinDomainAndName(name, domain, login_name);
			if( checkDedicatedExecuteAccounts( login_name.Value() ) ) {
				setExecuteAccountIsDedicated( login_name.Value() );
			}
		}

	} else if ( !can_switch_ids() ) {
		// Case 4: no account chosen and we can't switch; use our own.
		char *u = my_username();
		char *d = my_domainname();

		if ( !init_user_ids(u, d) ) {
			// shouldn't happen - we always can get our own token
			dprintf(D_ALWAYS, "Could not initialize user_priv with our own token!\n");
			init_priv_succeeded = false;
		}
		free(u);
		free(d);

	} else if( init_user_ids("nobody", ".") ) {
		// just init a new nobody user; dynuser handles the rest.
		// the "." means Local Machine to LogonUser

		setExecuteAccountIsDedicated( get_user_loginname() );

	} else {
		dprintf( D_ALWAYS, "ERROR: Could not initialize user_priv "
				 "as \"nobody\"\n" );
		init_priv_succeeded = false;
	}

	if ( name ) free(name);
	if ( domain ) free(domain);

	user_priv_is_initialized = init_priv_succeeded;
	return init_priv_succeeded;
}
/*
 * Find an existing gridmanager process for this owner (and optional
 * attribute/cluster/proc scoping), or fork a new one running as that
 * user.  Returns the hashtable node for the gridmanager, or NULL on
 * failure (shutdown in progress, root owner, missing config, bad
 * args, scratch-dir or fork failure).
 */
GridUniverseLogic::gman_node_t *
GridUniverseLogic::StartOrFindGManager(const char* owner, const char* domain,
	   	const char* attr_value, const char* attr_name, int cluster, int proc)
{
	gman_node_t* gman_node;
	int pid;

	// If attr_value is an empty string, convert to NULL since code
	// after this point expects that.
	if ( attr_value && strlen(attr_value)==0 ) {
		attr_value = NULL;
		attr_name = NULL;
	}

	if ( (gman_node=lookupGmanByOwner(owner, attr_value, cluster, proc)) ) {
		// found it
		return gman_node;
	}

	// not found.  fire one up!  we want to run the GManager as the user.

	// but first, make certain we are not shutting down...
	if (!gman_pid_table) {
		// destructor has already been called; we are probably
		// closing down.
		return NULL;
	}

#ifndef WIN32
	// Refuse to run a gridmanager as root.
	if (owner && strcasecmp(owner, "root") == 0 ) {
		dprintf(D_ALWAYS, "Tried to start condor_gmanager as root.\n");
		return NULL;
	}
#endif

	dprintf( D_FULLDEBUG, "Starting condor_gmanager for owner %s (%d.%d)\n",
			owner, cluster, proc);

	char *gman_binary;
	gman_binary = param("GRIDMANAGER");
	if ( !gman_binary ) {
		dprintf(D_ALWAYS,"ERROR - GRIDMANAGER not defined in config file\n");
		return NULL;
	}

	// Build the gridmanager command line.
	ArgList args;
	MyString error_msg;

	args.AppendArg("condor_gridmanager");
	args.AppendArg("-f");

	char *gman_args = param("GRIDMANAGER_ARGS");

	if(!args.AppendArgsV1RawOrV2Quoted(gman_args,&error_msg)) {
		dprintf( D_ALWAYS, "ERROR: failed to parse gridmanager args: %s\n",
				 error_msg.Value());
		free(gman_binary);
		free(gman_args);
		return NULL;
	}
	free(gman_args);

	// build a constraint
	if ( !owner ) {
		dprintf(D_ALWAYS,"ERROR - missing owner field\n");
		free(gman_binary);
		return NULL;
	}
	MyString constraint;
	if ( !attr_name  ) {
		constraint.formatstr("(%s=?=\"%s\"&&%s==%d)",
						   ATTR_OWNER,owner,
						   ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);
	} else {
		constraint.formatstr("(%s=?=\"%s\"&&%s=?=\"%s\"&&%s==%d)",
						   ATTR_OWNER,owner,
						   attr_name,attr_value,
						   ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);

		args.AppendArg("-A");
		args.AppendArg(attr_value);
	}
	args.AppendArg("-C");
	args.AppendArg(constraint.Value());

	// Pass the full owner@domain name with -o.
	MyString full_owner_name(owner);
	if ( domain && *domain ) {
		full_owner_name.formatstr_cat( "@%s", domain );
	}
	args.AppendArg("-o");
	args.AppendArg(full_owner_name.Value());

	if (!init_user_ids(owner, domain)) {
		dprintf(D_ALWAYS,"ERROR - init_user_ids() failed in GRIDMANAGER\n");
		free(gman_binary);
		return NULL;
	}

	static bool first_time_through = true;
	if ( first_time_through ) {
		// Note: Because first_time_through is static, this block runs only
		// once per schedd invocation.
		first_time_through = false;

		// Clean up any old / abandoned scratch dirs.
		dprintf(D_FULLDEBUG,"Checking for old gridmanager scratch dirs\n");
		char *prefix = temp_dir_path();
		ASSERT(prefix);
		Directory tmp( prefix, PRIV_USER );
		const char *f;
		char const *dot;
		int fname_pid;
		int mypid = daemonCore->getpid();
		int scratch_pre_len = strlen(scratch_prefix);
		while ( (f=tmp.Next()) ) {
				// skip regular files -- we only need to inspect subdirs
			if ( !tmp.IsDirectory() ) {
				continue;
			}
				// skip if it does not start with our prefix
			if ( strncmp(scratch_prefix,f,scratch_pre_len) ) {
				continue;
			}
				// skip if does not end w/ a pid
			dot = strrchr(f,'.');
			if ( !dot ) {
				continue;
			}
				// skip if this pid is still alive and not ours
			dot++;	// skip over period
			fname_pid = atoi(dot);
			if ( fname_pid != mypid && daemonCore->Is_Pid_Alive(fname_pid) ) {
					continue;
			}
				// if we made it here, blow away this subdir
			if ( tmp.Remove_Current_File() ) {
				dprintf(D_ALWAYS,"Removed old scratch dir %s\n",
				tmp.GetFullPath());
			}
		}	// end of while for cleanup of old scratch dirs

		dprintf(D_FULLDEBUG,"Done checking for old scratch dirs\n");

		if (prefix != NULL) {
			free(prefix);
			prefix = NULL;
		}

	}	// end of once-per-schedd invocation block

	// Create a temp dir for the gridmanager and append proper
	// command-line arguments to tell where it is.
	bool failed = false;
	gman_node = new gman_node_t;
	char *finalpath = scratchFilePath(gman_node);
	// Create the scratch dir as the user, then drop user ids again.
	priv_state saved_priv = set_user_priv();
	if ( (mkdir(finalpath,0700)) < 0 ) {
		// mkdir failed.
		dprintf(D_ALWAYS,"ERROR - mkdir(%s,0700) failed in GRIDMANAGER, errno=%d (%s)\n",
				finalpath, errno, strerror(errno));
		failed = true;
	}
	set_priv(saved_priv);
	uninit_user_ids();
	args.AppendArg("-S");	// -S = "ScratchDir" argument
	args.AppendArg(finalpath);
	delete [] finalpath;
	if ( failed ) {
		// we already did dprintf reason to the log...
		free(gman_binary);
		delete gman_node;
		return NULL;
	}

	if(IsFulldebug(D_FULLDEBUG)) {
		MyString args_string;
		args.GetArgsStringForDisplay(&args_string);
		dprintf(D_FULLDEBUG,"Really Execing %s\n",args_string.Value());
	}

	pid = daemonCore->Create_Process(
		gman_binary,			// Program to exec
		args,					// Command-line args
		PRIV_ROOT,				// Run as root, so it can switch to
		                        //   PRIV_CONDOR
		rid						// Reaper ID
		);

	free(gman_binary);

	if ( pid <= 0 ) {
		dprintf ( D_ALWAYS, "StartOrFindGManager: Create_Process problems!\n" );
		if (gman_node) delete gman_node;
		return NULL;
	}

	// If we made it here, we happily started up a new gridmanager process

	dprintf( D_ALWAYS, "Started condor_gmanager for owner %s pid=%d\n",
			owner,pid);

	// Make a new gman_node entry for our hashtable & insert it
	// NOTE(review): gman_node was already allocated above, so this
	// check appears to be dead code -- confirm.
	if ( !gman_node ) {
		gman_node = new gman_node_t;
	}
	gman_node->pid = pid;
	gman_node->owner[0] = '\0';
	gman_node->domain[0] = '\0';
	// NOTE(review): these strcpy calls assume the owner/domain fit the
	// fixed-size fields of gman_node_t -- confirm bounds elsewhere.
	if ( owner ) {
		strcpy(gman_node->owner,owner);
	}
	if ( domain ) {
		strcpy(gman_node->domain,domain);
	}
	MyString owner_key(owner);
	if(attr_value){
		owner_key += attr_value;
	}
	if (cluster) {
		owner_key.formatstr_cat( "-%d.%d", cluster, proc );
	}

	ASSERT( gman_pid_table->insert(owner_key,gman_node) == 0 );

	// start timer to signal gridmanager if we haven't already
	if ( gman_node->add_timer_id == -1 ) {  // == -1 means no timer set
		gman_node->add_timer_id = daemonCore->Register_Timer(job_added_delay,
			GridUniverseLogic::SendAddSignal,
			"GridUniverseLogic::SendAddSignal");
		daemonCore->Register_DataPtr(gman_node);
	}

	// All done
	return gman_node;
}
/*
 * Reaper for gridmanager child processes.  Finds the hashtable node
 * matching the exited pid, logs how it exited (emailing the admin --
 * rate-limited to once per six hours -- if it died because it could
 * not write its log), cancels its timers, removes it from the table,
 * removes its scratch directory as the owning user, and frees the
 * node.  Always returns 0.
 */
int GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.
	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

	// Build a human-readable description of who exited and why.
	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) { owner_safe = owner; }
	else { owner_safe = "Unknown"; }
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
							 WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
							 daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS,
			"The gridmanager had a problem writing its log. "
			"Check the permissions of the file specified by GRIDMANAGER_LOG; "
			"it needs to be writable by Condor.\n");

			/* send email to the admin about this, but only
			 * every six hours - enough to not be ignored, but
			 * not enough to be a pest.  If only my children were
			 * so helpful and polite.  Ah, well, we can always dream...
			 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
					"The condor_gridmanager had an error writing its log file. Check the \n"
					"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
					"the condor_config file.  This file needs to be writable as user %s to enable\n"
					"the condor_gridmanager daemon to write to it. \n\n"
					"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
					"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
					// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}
	// Remove node from our hash table
	gman_pid_table->remove(owner);
	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
	// The directory was created as the user, so remove it as the user.
	if ( IsDirectory(scratchdir) &&
		 init_user_ids(gman_node->owner, gman_node->domain) )
	{
		priv_state saved_priv = set_user_priv();
			// Must put this in braces so the Directory object
			// destructor is called, which will free the iterator
			// handle.  If we didn't do this, the below rmdir
			// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}