Esempio n. 1
0
int
init_user_id_from_FQN (const char * _fqn) {

  char * uid = NULL;
  char * domain = NULL;
  char * fqn = NULL;
  char default_uid [] = "nobody";

  if (_fqn) {
    fqn = strdup (_fqn);
    uid = fqn;

    // Domain?
    char * pAt = strchr (fqn, '@');
    if (pAt) {
      *pAt='\0';
      domain = pAt+1;
    }
  }
  
  if (uid == NULL) {
    uid = default_uid;
  }

  int rc = init_user_ids (uid, domain);
  dprintf (D_FULLDEBUG, "Switching to user %s@%s, result = %d\n", uid, domain, rc);

  if (fqn)
    free (fqn);

  return rc;
}
Esempio n. 2
0
void
main_pre_dc_init( int argc, char* argv[] )
{
	// handle -o, so that we can switch euid to the user before
	// daemoncore does most of its initialization work.
	int i = 1;
	while ( i < argc ) {
		if ( !strcmp( argv[i], "-o" ) ) {
			// Say what user we're running jobs on behave of.
			// If the schedd starts us as root, we need to switch to
			// this uid for most of our life.
			if ( argc <= i + 1 ) {
				usage( argv[0] );
			}
			myUserName = strdup( argv[i + 1] );
			break;
		}
		i++;
	}

	if ( myUserName ) {
		char *owner = strdup( myUserName );
		char *domain = strchr( owner, '@' );
		if ( domain ) {
			*domain = '\0';
			domain = domain + 1;
		}
		if ( !init_user_ids(owner, domain)) {
			dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			// uids.C will EXCEPT when we set_user_priv() now
			// so there's not much we can do at this point
		}
		set_user_priv();
		// We can't call daemonCore->Register_Priv_State() here because
		// there's no daemonCore object yet. We'll call it in main_init().

		free( myUserName );
		myUserName = owner;
	} else if ( is_root() ) {
		dprintf( D_ALWAYS, "Don't know what user to run as!\n" );
		DC_Exit( 1 );
	} else {
		myUserName = my_username();
	}
}
Esempio n. 3
0
void 
initialize_uids(void)
{
#if defined(WIN32)
#include "my_username.h"

	char *name = NULL;
	char *domain = NULL;

	name = my_username();
	domain = my_domainname();

	caller_name = name;
	job_user_name = name;

	if ( !init_user_ids(name, domain ) ) {
		// shouldn't happen - we always can get our own token
		vmprintf(D_ALWAYS, "Could not initialize user_priv with our own token!\n");
	}

	vmprintf(D_ALWAYS, "Initialize Uids: caller=%s@%s, job user=%s@%s\n", 
			caller_name.Value(), domain, job_user_name.Value(), domain);

	if( name ) {
		free(name);
	}
	if( domain ) {
		free(domain);
	}
	return;
#else
	// init_user_ids was called in main_pre_dc_init()
	vmprintf(D_ALWAYS, "Initial UID/GUID=%d/%d, EUID/EGUID=%d/%d, "
			"Condor UID/GID=%d,%d\n", (int)getuid(), (int)getuid(), 
			(int)geteuid(), (int)getegid(), 
			(int)get_condor_uid(), (int)get_condor_gid());

	vmprintf(D_ALWAYS, "Initialize Uids: caller=%s, job user=%s\n", 
			caller_name.Value(), job_user_name.Value());
	
	return;
#endif
}
Esempio n. 4
0
bool
XInterface::TryUser(const char *user)
{
	static char env[1024];
	static bool need_uninit = false;
	passwd *passwd_entry;

	passwd_entry = getpwnam(user);
	if(passwd_entry == NULL) {
		// We couldn't find the current user in the passwd file?
		dprintf( D_FULLDEBUG, 
		 	"Current user cannot be found in passwd file.\n" );
		return false;
	} else {
		sprintf(env, "XAUTHORITY=%s/.Xauthority", passwd_entry->pw_dir);
		if(putenv(env) != 0) {
			EXCEPT("Putenv failed!.");
		}
	}

	if ( need_uninit ) {
		uninit_user_ids();
		need_uninit = false;
	} 

		// passing "root" to init_user_ids is fatal
	if (strcmp(user, "root") == 0) {
		set_root_priv();
	} else {
		init_user_ids( user, NULL );
		set_user_priv();
		need_uninit = true;
	}

	dprintf( D_FULLDEBUG, "Using %s's .Xauthority: \n", passwd_entry->pw_name );
	return true;
}
Esempio n. 5
0
void
BaseShadow::baseInit( ClassAd *job_ad, const char* schedd_addr, const char *xfer_queue_contact_info )
{
	int pending = FALSE;

	if( ! job_ad ) {
		EXCEPT("baseInit() called with NULL job_ad!");
	}
	jobAd = job_ad;

	if (sendUpdatesToSchedd && ! is_valid_sinful(schedd_addr)) {
		EXCEPT("schedd_addr not specified with valid address");
	}
	scheddAddr = sendUpdatesToSchedd ? strdup( schedd_addr ) : strdup("noschedd");

	m_xfer_queue_contact_info = xfer_queue_contact_info;

	if ( !jobAd->LookupString(ATTR_OWNER, owner)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_OWNER);
	}

	if( !jobAd->LookupInteger(ATTR_CLUSTER_ID, cluster)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_CLUSTER_ID);
	}

	if( !jobAd->LookupInteger(ATTR_PROC_ID, proc)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_PROC_ID);
	}


		// Grab the GlobalJobId if we've got it.
	if( ! jobAd->LookupString(ATTR_GLOBAL_JOB_ID, &gjid) ) {
		gjid = NULL;
	}

	// grab the NT domain if we've got it
	jobAd->LookupString(ATTR_NT_DOMAIN, domain);
	if ( !jobAd->LookupString(ATTR_JOB_IWD, iwd)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_JOB_IWD);
	}

	if( !jobAd->LookupFloat(ATTR_BYTES_SENT, prev_run_bytes_sent) ) {
		prev_run_bytes_sent = 0;
	}
	if( !jobAd->LookupFloat(ATTR_BYTES_RECVD, prev_run_bytes_recvd) ) {
		prev_run_bytes_recvd = 0;
	}

		// construct the core file name we'd get if we had one.
	MyString tmp_name = iwd;
	tmp_name += DIR_DELIM_CHAR;
	tmp_name += "core.";
	tmp_name += cluster;
	tmp_name += '.';
	tmp_name += proc;
	core_file_name = strdup( tmp_name.Value() );

        // put the shadow's sinful string into the jobAd.  Helpful for
        // the mpi shadow, at least...and a good idea in general.
	MyString tmp_addr = ATTR_MY_ADDRESS;
	tmp_addr += "=\"";
	tmp_addr += daemonCore->InfoCommandSinfulString();
	tmp_addr += '"';
    if ( !jobAd->Insert( tmp_addr.Value() )) {
        EXCEPT( "Failed to insert %s!", ATTR_MY_ADDRESS );
    }

	DebugId = display_dprintf_header;
	
	config();

		// Make sure we've got enough swap space to run
	checkSwap();

	// handle system calls with Owner's privilege
// XXX this belong here?  We'll see...
	// Calling init_user_ids() while in user priv causes badness.
	// Make sure we're in another priv state.
	set_condor_priv();
	if ( !init_user_ids(owner.Value(), domain.Value())) {
		dprintf(D_ALWAYS, "init_user_ids() failed as user %s\n",owner.Value() );
		// uids.C will EXCEPT when we set_user_priv() now
		// so there's not much we can do at this point
		
#if ! defined(WIN32)
		if ( param_boolean( "SHADOW_RUN_UNKNOWN_USER_JOBS", false ) )
		{
			dprintf(D_ALWAYS, "trying init_user_ids() as user nobody\n" );
			
			owner="nobody";
			domain=NULL;
			if (!init_user_ids(owner.Value(), domain.Value()))
			{
				dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			}
			else
			{
				jobAd->Assign( ATTR_JOB_RUNAS_OWNER, "FALSE" );
				m_RunAsNobody=true;
				dprintf(D_ALWAYS, "init_user_ids() now running as user nobody\n");
			}
		}
#endif

	}
	set_user_priv();
	daemonCore->Register_Priv_State( PRIV_USER );

	dumpClassad( "BaseShadow::baseInit()", this->jobAd, D_JOB );

		// initialize the UserPolicy object
	shadow_user_policy.init( jobAd, this );

		// setup an object to keep our job ad updated to the schedd's
		// permanent job queue.  this clears all the dirty bits on our
		// copy of the classad, so anything we touch after this will
		// be updated to the schedd when appropriate.

		// Unless we got a command line arg asking us not to
	if (sendUpdatesToSchedd) {
		// the usual case
		job_updater = new QmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	} else {
		job_updater = new NullQmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	}

		// init user log; hold on failure
		// NOTE: job_updater must be initialized _before_ initUserLog(),
		// in order to handle the case of the job going on hold as a
		// result of failure in initUserLog().
	initUserLog();

		// change directory; hold on failure
	if ( cdToIwd() == -1 ) {
		EXCEPT("Could not cd to initial working directory");
	}

		// check to see if this invocation of the shadow is just to write
		// a terminate event and exit since this job had been recorded as
		// pending termination, but somehow the job didn't leave the queue
		// and the schedd is trying to restart it again..
	if( jobAd->LookupInteger(ATTR_TERMINATION_PENDING, pending)) {
		if (pending == TRUE) {
			// If the classad of this job "thinks" that this job should be
			// finished already, let's enact that belief.
			// This function does not return.
			this->terminateJob(US_TERMINATE_PENDING);
		}
	}

		// If we need to claim the startd before activating the claim
	int wantClaiming = 0;
	jobAd->LookupBool(ATTR_CLAIM_STARTD, wantClaiming);
	if (wantClaiming) {
		MyString startdSinful;
		MyString claimid;

			// Pull startd addr and claimid out of the jobad
		jobAd->LookupString(ATTR_STARTD_IP_ADDR, startdSinful);
		jobAd->LookupString(ATTR_CLAIM_ID, claimid);

		dprintf(D_ALWAYS, "%s is true, trying to claim startd %s\n", ATTR_CLAIM_STARTD, startdSinful.Value());

		classy_counted_ptr<DCStartd> startd = new DCStartd("description", NULL, startdSinful.Value(), claimid.Value());
	
		classy_counted_ptr<DCMsgCallback> cb = 
			new DCMsgCallback((DCMsgCallback::CppFunction)&BaseShadow::startdClaimedCB,
			this, jobAd);
																 
			// this can't fail, will always call the callback
		startd->asyncRequestOpportunisticClaim(jobAd, 
											   "description", 
											   daemonCore->InfoCommandSinfulString(), 
											   1200 /*alive interval*/, 
											   20 /* net timeout*/, 
											   100 /*total timeout*/, 
											   cb);
	}
}
Esempio n. 6
0
bool
JICLocal::initUserPriv( void )
{
	bool rval = false;

#ifdef WIN32
		/*
		  If we're on windoze, and this is anything but a local
		  universe job, we should immediately try the windoze-specific
		  method for per-slot users, setting up a nobody account, etc,
		  and be done with it.  However, if it's local universe, we
		  basically want to do what we do for the unix case: find
		  ATTR_OWNER (and ATTR_NT_DOMAIN) from the job ad and
		  initialize ourselves with that...
		*/
	if( job_universe != CONDOR_UNIVERSE_LOCAL ) {
		return initUserPrivWindows();
	}
#endif

		// Before we go through any trouble, see if we even need
		// ATTR_OWNER to initialize user_priv.  If not, go ahead and
		// initialize it as appropriate.  
	if( initUserPrivNoOwner() ) {
		return true;
	}

	char* owner = NULL;
	if( job_ad->LookupString( ATTR_OWNER, &owner ) != 1 ) {
		dprintf( D_ALWAYS, "ERROR: %s not found in JobAd.  Aborting.\n", 
				 ATTR_OWNER );
		return false;
	}

#ifdef WIN32
		// we only care about or expect to find this attribute if
		// we're on windoze...
	char* domain = NULL;
	if( job_ad->LookupString( ATTR_NT_DOMAIN, &domain ) != 1 ) {
		dprintf( D_ALWAYS, "ERROR: %s not found in JobAd.  Aborting.\n", 
				 ATTR_NT_DOMAIN );
		return false;
	}

	if( ! init_user_ids(owner,domain) ) { 
		dprintf( D_ALWAYS,
				 "ERROR: Bad or missing credential for user \"%s@%s\"\n",
				 owner, domain );
	} else {  
		rval = true;
		dprintf( D_FULLDEBUG, "Initialized user_priv as \"%s@%s\"\n", 
				 owner, domain );
	}
	if( domain ) {
		free( domain );
		domain = NULL;
	}

#else /* UNIX */

	if( ! init_user_ids_quiet(owner) ) { 
		dprintf( D_ALWAYS, "ERROR: Uid for \"%s\" not found in "
				 "passwd database for a local job\n", owner ); 
	} else {
		CondorPrivSepHelper* psh = Starter->condorPrivSepHelper();
		if (psh != NULL) {
			psh->initialize_user(owner);
		}
		rval = true;
		dprintf( D_FULLDEBUG, "Initialized user_priv as \"%s\"\n", 
				 owner );
	}

#endif

		// deallocate owner string so we don't leak memory.
	free( owner );
	owner = NULL;

	if( rval ) {
		user_priv_is_initialized = true;
	}
	return rval;
}
Esempio n. 7
0
void
HandleSyscalls()
{
	register int	cnt;
	fd_set 			readfds;
	int 			nfds = -1;

	time_t			periodic_interval_len = 20; /* secs, empirically found :) */

	nfds = (RSC_SOCK > CLIENT_LOG ) ? (RSC_SOCK + 1) : (CLIENT_LOG + 1);

	init_user_ids(Proc->owner, NULL);
	set_user_priv();

	dprintf(D_FULLDEBUG, "HandleSyscalls: about to chdir(%s)\n", Proc->iwd);
	if( chdir(Proc->iwd) < 0 ) {
		sprintf( ErrBuf,  "Can't chdir() to \"%s\"! [%s(%d)]", Proc->iwd, 
			strerror(errno), errno );
		HadErr = TRUE;
		return;
	}

	dprintf(D_SYSCALLS, "Shadow: Starting to field syscall requests\n");
	errno = 0;

	time_t current_time = time(0);
	time_t next_periodic_update = current_time + periodic_interval_len;
	
	for(;;) {	/* get a request and fulfill it */

		FD_ZERO(&readfds);
		FD_SET(RSC_SOCK, &readfds);
		FD_SET(CLIENT_LOG, &readfds);

		struct timeval *ptimer = NULL, timer;
		timer.tv_sec = next_periodic_update - current_time;
		timer.tv_usec = 0;
		ptimer = &timer;
		/* if the current timer is set for a time longer than this, than
			truncate the timer required to the periodic limit. After 
			inspection of the bandwidth timer, it seems that it will recorrect
			itself if select comes out of the loop before the timer goes off
			anyway to handle syscalls */
		if ( timer.tv_sec > periodic_interval_len) {
			timer.tv_sec = next_periodic_update - current_time;
			ptimer = &timer;
		}

		unblock_signal(SIGCHLD);
		unblock_signal(SIGUSR1);
#if defined(LINUX) || defined(Solaris)
		cnt = select(nfds, &readfds, (fd_set *)0, (fd_set *)0, ptimer);
#else
		cnt = select(nfds, &readfds, 0, 0, ptimer);
#endif
		block_signal(SIGCHLD);
		block_signal(SIGUSR1);

		if( cnt < 0 && errno != EINTR ) {
			EXCEPT("HandleSyscalls: select: errno=%d, rsc_sock=%d, client_log=%d",errno,RSC_SOCK,CLIENT_LOG);
		}

		if( cnt < 0 && errno == EINTR ) {
			continue;
		}

		if( FD_ISSET(CLIENT_LOG, &readfds) ) {
			if( HandleLog() < 0 ) {
				EXCEPT( "Peer went away" );
			}
		}

		if( FD_ISSET(RSC_SOCK, &readfds) ) {
			if( do_REMOTE_syscall() < 0 ) {
				dprintf(D_SYSCALLS,
						"Shadow: do_REMOTE_syscall returned < 0\n");
				break;
			}
		}

		if( FD_ISSET(UMBILICAL, &readfds) ) {
			dprintf(D_ALWAYS,
				"Shadow: Local scheduler apparently died, so I die too\n");
			exit(1);
		}

		current_time = time(0);

		/* if this is true, then do the periodic_interval_len events */
		if (current_time >= next_periodic_update) {
			next_periodic_update = current_time + periodic_interval_len;

			/* evaluate some attributes for policies like determining what to
			do if a job suspends wierdly or some such thing. This function
			has the possibility of making the shadow exit with JOB_SHOULD_HOLD
			or futzing up some global variables about how the job could've
			exited and letting Wraup take care of it. */
			if (periodic_policy() == true)
			{
				break;
			}
		}

#if defined(SYSCALL_DEBUG)
		strcpy( SyscallLabel, "shadow" );
#endif
	}

	/*
	The user job might exit while there is still unread data in the log.
	So, select with a timeout of zero, and flush everything from the log.
	*/
		/* 
		   NOTE: Since HandleLog does it's own loop to make sure it's
		   read everything, we don't need a loop here, and should only
		   call HandleLog once.  In fact, if there's a problem w/
		   select(), a loop here can cause an infinite loop.  
		   -Derek Wright and Jim Basney, 2/17/99.
		*/
	HandleLog();
	
		/* Take back normal condor privileges */
	set_condor_priv();

		/* If we are debugging with named pipes as our communications medium,
		   won't have a condor_startd running - don't try to send to it.
		*/
	if( !UsePipes ) {
		send_quit( ExecutingHost, GlobalCap );
	}

	dprintf(D_ALWAYS,
		"Shadow: Job %d.%d exited, termsig = %d, coredump = %d, retcode = %d\n",
			Proc->id.cluster, Proc->id.proc, WTERMSIG(JobStatus),
			WCOREDUMP(JobStatus), WEXITSTATUS(JobStatus));
}
Esempio n. 8
0
bool
JobInfoCommunicator::initUserPrivWindows( void )
{
    // Win32
    // taken origionally from OsProc::StartJob.  Here we create the
    // user and initialize user_priv.

    // By default, assume execute login may be shared by other processes.
    setExecuteAccountIsDedicated( NULL );

    // we support running the job as other users if the user
    // is specifed in the config file, and the account's password
    // is properly stored in our credential stash.

    char *name = NULL;
    char *domain = NULL;
    bool init_priv_succeeded = true;
    bool run_as_owner = allowRunAsOwner( false, false );

    // TODO..
    // Currently vmgahp for VMware VM universe can't run as user on Windows.
    // It seems like a bug of VMware. VMware command line tool such as "vmrun"
    // requires Administrator privilege.
    // So here we set name and domain with my_username and my_domainname
    // -jaeyoung 06/15/07
    if( job_universe == CONDOR_UNIVERSE_VM ) {
#if 0
        // If "VM_UNIV_NOBODY_USER" is defined in Condor configuration file,
        // wee will use it.
        char *vm_jobs_as = param("VM_UNIV_NOBODY_USER");
        if (vm_jobs_as) {
            getDomainAndName(vm_jobs_as, domain, name);
            /*
             * name and domain are now just pointers into vm_jobs_as
             * buffer.  copy these values into their own buffer so we
             * deallocate below.
             */
            if ( name ) {
                name = strdup(name);
            }
            if ( domain ) {
                domain = strdup(domain);
            }
            free(vm_jobs_as);
        }
#endif
        MyString vm_type;
        job_ad->LookupString(ATTR_JOB_VM_TYPE, vm_type);

        if( strcasecmp(vm_type.Value(), CONDOR_VM_UNIVERSE_VMWARE) == MATCH ) {
            name = my_username();
            domain = my_domainname();
        }
    }

    if( !name ) {
        if ( run_as_owner ) {
            job_ad->LookupString(ATTR_OWNER,&name);
            job_ad->LookupString(ATTR_NT_DOMAIN,&domain);
        }
    }

    if ( !name ) {
        char slot_user[255];
        MyString slotName = "";
        slotName = Starter->getMySlotName();

        slotName.upper_case();
        sprintf(slot_user, "%s_USER", slotName);
        char *run_jobs_as = param(slot_user);
        if (run_jobs_as) {
            getDomainAndName(run_jobs_as, domain, name);
            /*
             * name and domain are now just pointers into run_jobs_as
             * buffer.  copy these values into their own buffer so we
             * deallocate below.
             */
            if ( name ) {
                name = strdup(name);
            }
            if ( domain ) {
                domain = strdup(domain);
            }
            free(run_jobs_as);
        }
    }

    if ( name ) {

        if (!init_user_ids(name, domain)) {

            dprintf(D_ALWAYS, "Could not initialize user_priv as \"%s\\%s\".\n"
                    "\tMake sure this account's password is securely stored "
                    "with condor_store_cred.\n", domain, name );
            init_priv_succeeded = false;
        }
        else {
            MyString login_name;
            joinDomainAndName(name, domain, login_name);
            if( checkDedicatedExecuteAccounts( login_name.Value() ) ) {
                setExecuteAccountIsDedicated( login_name.Value() );
            }
        }

    } else if ( !can_switch_ids() ) {
        char *u = my_username();
        char *d = my_domainname();

        if ( !init_user_ids(u, d) ) {
            // shouldn't happen - we always can get our own token
            dprintf(D_ALWAYS, "Could not initialize user_priv with our own token!\n");
            init_priv_succeeded = false;
        }
        free(u);
        free(d);
    } else if( init_user_ids("nobody", ".") ) {
        // just init a new nobody user; dynuser handles the rest.
        // the "." means Local Machine to LogonUser

        setExecuteAccountIsDedicated( get_user_loginname() );
    }
    else {

        dprintf( D_ALWAYS, "ERROR: Could not initialize user_priv "
                 "as \"nobody\"\n" );
        init_priv_succeeded = false;

    }

    if ( name ) free(name);
    if ( domain ) free(domain);

    user_priv_is_initialized = init_priv_succeeded;
    return init_priv_succeeded;
}
Esempio n. 9
0
GridUniverseLogic::gman_node_t *
GridUniverseLogic::StartOrFindGManager(const char* owner, const char* domain,
	   	const char* attr_value, const char* attr_name, int cluster, int proc)
{
	gman_node_t* gman_node;
	int pid;

		// If attr_value is an empty string, convert to NULL since code
		// after this point expects that.
	if ( attr_value && strlen(attr_value)==0 ) {
		attr_value = NULL;
		attr_name = NULL;
	}

	if ( (gman_node=lookupGmanByOwner(owner, attr_value, cluster, proc)) ) {
		// found it
		return gman_node;
	}

	// not found.  fire one up!  we want to run the GManager as the user.

	// but first, make certain we are not shutting down...
	if (!gman_pid_table) {
		// destructor has already been called; we are probably
		// closing down.
		return NULL;
	}


#ifndef WIN32
	if (owner && strcasecmp(owner, "root") == 0 ) {
		dprintf(D_ALWAYS, "Tried to start condor_gmanager as root.\n");
		return NULL;
	}
#endif

	dprintf( D_FULLDEBUG, "Starting condor_gmanager for owner %s (%d.%d)\n",
			owner, cluster, proc);

	char *gman_binary;
	gman_binary = param("GRIDMANAGER");
	if ( !gman_binary ) {
		dprintf(D_ALWAYS,"ERROR - GRIDMANAGER not defined in config file\n");
		return NULL;
	}

	ArgList args;
	MyString error_msg;

	args.AppendArg("condor_gridmanager");
	args.AppendArg("-f");

	char *gman_args = param("GRIDMANAGER_ARGS");

	if(!args.AppendArgsV1RawOrV2Quoted(gman_args,&error_msg)) {
		dprintf( D_ALWAYS, "ERROR: failed to parse gridmanager args: %s\n",
				 error_msg.Value());
		free(gman_binary);
		free(gman_args);
		return NULL;
	}
	free(gman_args);

	// build a constraint
	if ( !owner ) {
		dprintf(D_ALWAYS,"ERROR - missing owner field\n");
		free(gman_binary);
		return NULL;
	}
	MyString constraint;
	if ( !attr_name  ) {
		constraint.formatstr("(%s=?=\"%s\"&&%s==%d)",
						   ATTR_OWNER,owner,
						   ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);
	} else {
		constraint.formatstr("(%s=?=\"%s\"&&%s=?=\"%s\"&&%s==%d)",
						   ATTR_OWNER,owner,
						   attr_name,attr_value,
						   ATTR_JOB_UNIVERSE,CONDOR_UNIVERSE_GRID);

		args.AppendArg("-A");
		args.AppendArg(attr_value);
	}
	args.AppendArg("-C");
	args.AppendArg(constraint.Value());

	MyString full_owner_name(owner);
	if ( domain && *domain ) {
		full_owner_name.formatstr_cat( "@%s", domain );
	}
	args.AppendArg("-o");
	args.AppendArg(full_owner_name.Value());

	if (!init_user_ids(owner, domain)) {
		dprintf(D_ALWAYS,"ERROR - init_user_ids() failed in GRIDMANAGER\n");
		free(gman_binary);
		return NULL;
	}

	static bool first_time_through = true;
	if ( first_time_through ) {
		// Note: Because first_time_through is static, this block runs only 
		// once per schedd invocation.
		first_time_through = false;

		// Clean up any old / abandoned scratch dirs.
		dprintf(D_FULLDEBUG,"Checking for old gridmanager scratch dirs\n");
		char *prefix = temp_dir_path();
		ASSERT(prefix);
		Directory tmp( prefix, PRIV_USER );
		const char *f;
		char const *dot;
		int fname_pid;
		int mypid = daemonCore->getpid();
		int scratch_pre_len = strlen(scratch_prefix);
		while ( (f=tmp.Next()) ) {
				// skip regular files -- we only need to inspect subdirs
			if ( !tmp.IsDirectory() ) {
				continue;
			}
				// skip if it does not start with our prefix
			if ( strncmp(scratch_prefix,f,scratch_pre_len) ) {
				continue;
			}
				// skip if does not end w/ a pid
			dot = strrchr(f,'.');
			if ( !dot ) {
				continue;
			}
				// skip if this pid is still alive and not ours
			dot++;	// skip over period
			fname_pid = atoi(dot);
			if ( fname_pid != mypid && daemonCore->Is_Pid_Alive(fname_pid) ) {
					continue;
			}
				// if we made it here, blow away this subdir
			if ( tmp.Remove_Current_File() ) {
				dprintf(D_ALWAYS,"Removed old scratch dir %s\n",
				tmp.GetFullPath());
			}
		}	// end of while for cleanup of old scratch dirs

		dprintf(D_FULLDEBUG,"Done checking for old scratch dirs\n");			

		if (prefix != NULL) {
			free(prefix);
			prefix = NULL;
		}

	}	// end of once-per-schedd invocation block

	// Create a temp dir for the gridmanager and append proper
	// command-line arguments to tell where it is.
	bool failed = false;
	gman_node = new gman_node_t;
	char *finalpath = scratchFilePath(gman_node);
	priv_state saved_priv = set_user_priv();
	if ( (mkdir(finalpath,0700)) < 0 ) {
		// mkdir failed.  
		dprintf(D_ALWAYS,"ERROR - mkdir(%s,0700) failed in GRIDMANAGER, errno=%d (%s)\n",
				finalpath, errno, strerror(errno));
		failed = true;
	}
	set_priv(saved_priv);
	uninit_user_ids();
	args.AppendArg("-S");	// -S = "ScratchDir" argument
	args.AppendArg(finalpath);
	delete [] finalpath;
	if ( failed ) {
		// we already did dprintf reason to the log...
		free(gman_binary);
		delete gman_node;
		return NULL;
	}

	if(IsFulldebug(D_FULLDEBUG)) {
		MyString args_string;
		args.GetArgsStringForDisplay(&args_string);
		dprintf(D_FULLDEBUG,"Really Execing %s\n",args_string.Value());
	}

	pid = daemonCore->Create_Process( 
		gman_binary,			// Program to exec
		args,					// Command-line args
		PRIV_ROOT,				// Run as root, so it can switch to
		                        //   PRIV_CONDOR
		rid						// Reaper ID
		);

	free(gman_binary);

	if ( pid <= 0 ) {
		dprintf ( D_ALWAYS, "StartOrFindGManager: Create_Process problems!\n" );
		if (gman_node) delete gman_node;
		return NULL;
	}

	// If we made it here, we happily started up a new gridmanager process

	dprintf( D_ALWAYS, "Started condor_gmanager for owner %s pid=%d\n",
			owner,pid);

	// Make a new gman_node entry for our hashtable & insert it
	if ( !gman_node ) {
		gman_node = new gman_node_t;
	}
	gman_node->pid = pid;
	gman_node->owner[0] = '\0';
	gman_node->domain[0] = '\0';
	if ( owner ) {
		strcpy(gman_node->owner,owner);
	}
	if ( domain ) {
		strcpy(gman_node->domain,domain);
	}
	MyString owner_key(owner);
	if(attr_value){
		owner_key += attr_value;
	}
	if (cluster) {
		owner_key.formatstr_cat( "-%d.%d", cluster, proc );
	}

	ASSERT( gman_pid_table->insert(owner_key,gman_node) == 0 );

	// start timer to signal gridmanager if we haven't already
	if ( gman_node->add_timer_id == -1 ) {  // == -1 means no timer set
		gman_node->add_timer_id = daemonCore->Register_Timer(job_added_delay,
			GridUniverseLogic::SendAddSignal,
			"GridUniverseLogic::SendAddSignal");
		daemonCore->Register_DataPtr(gman_node);
	}

	// All done
	return gman_node;
}
Esempio n. 10
0
int 
GridUniverseLogic::GManagerReaper(Service *,int pid, int exit_status)
{
	gman_node_t* gman_node = NULL;
	MyString owner;

	// Iterate through our table to find the node w/ this pid
	// Someday we should perhaps also hash on the pid, but we
	// don't expect gridmanagers to exit very often, and there
	// are not that many of them.

	if (gman_pid_table) {
		gman_node_t* tmpnode;
		gman_pid_table->startIterations();
		while ( gman_pid_table->iterate(owner,tmpnode) ) {
			if (tmpnode->pid == pid ) {
				// found it!
				gman_node = tmpnode;
				break;
			}
		}
	}

	MyString owner_safe;
	MyString exit_reason;
	if(gman_node) { owner_safe = owner; }
	else { owner_safe = "Unknown"; }
	if ( WIFEXITED( exit_status ) ) {
		exit_reason.formatstr( "with return code %d",
							 WEXITSTATUS( exit_status ) );
	} else {
		exit_reason.formatstr( "due to %s",
							 daemonCore->GetExceptionString( exit_status ) );
	}
	dprintf(D_ALWAYS, "condor_gridmanager (PID %d, owner %s) exited %s.\n",
			pid, owner_safe.Value(), exit_reason.Value() );
	if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR) {
		const char *condorUserName = get_condor_username();

		dprintf(D_ALWAYS, 
			"The gridmanager had a problem writing its log. "
			"Check the permissions of the file specified by GRIDMANAGER_LOG; "
			"it needs to be writable by Condor.\n");

			/* send email to the admin about this, but only
			 * every six hours - enough to not be ignored, but
			 * not enough to be a pest.  If only my children were
			 * so helpful and polite.  Ah, well, we can always dream...
			 */
		static time_t last_email_re_gridmanlog = 0;
		if ( time(NULL) - last_email_re_gridmanlog > 6 * 60 * 60 ) {
			last_email_re_gridmanlog = time(NULL);
			FILE *email = email_admin_open("Unable to launch grid universe jobs.");
			if ( email ) {
				fprintf(email,
					"The condor_gridmanager had an error writing its log file.  Check the  \n"
					"permissions/ownership of the file specified by the GRIDMANAGER_LOG setting in \n"
					"the condor_config file.  This file needs to be writable as user %s to enable\n"
					"the condor_gridmanager daemon to write to it. \n\n"
					"Until this problem is fixed, grid universe jobs submitted from this machine cannot "
					"be launched.\n", condorUserName ? condorUserName : "******" );
				email_close(email);
			} else {
					// Error sending an email message
				dprintf(D_ALWAYS,"ERROR: Cannot send email to the admin\n");
			}
		}	
	}	// end if(WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == DPRINTF_ERROR)

	if (!gman_node) {
		// nothing more to do, so return
		return 0;
	}

	// Cancel any timers before removing the node!!
	if (gman_node->add_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->add_timer_id);
	}
	if (gman_node->remove_timer_id != -1) {
		daemonCore->Cancel_Timer(gman_node->remove_timer_id);
	}
	// Remove node from our hash table
	gman_pid_table->remove(owner);
	// Remove any scratch directory used by this gridmanager
	char *scratchdir = scratchFilePath(gman_node);
	ASSERT(scratchdir);
	if ( IsDirectory(scratchdir) && 
		 init_user_ids(gman_node->owner, gman_node->domain) ) 
	{
		priv_state saved_priv = set_user_priv();
			// Must put this in braces so the Directory object
			// destructor is called, which will free the iterator
			// handle.  If we didn't do this, the below rmdir 
			// would fail.
		{
			Directory tmp( scratchdir );
			tmp.Remove_Entire_Directory();
		}
		if ( rmdir(scratchdir) == 0 ) {
			dprintf(D_FULLDEBUG,"Removed scratch dir %s\n",scratchdir);
		} else {
			dprintf(D_FULLDEBUG,"Failed to remove scratch dir %s\n",
					scratchdir);
		}
		set_priv(saved_priv);
		uninit_user_ids();
	}
	delete [] scratchdir;

	// Reclaim memory from the node itself
	delete gman_node;

	return 0;
}