Example #1
void
BaseShadow::publishShadowAttrs( ClassAd* ad )
{
	MyString tmp;
	tmp = ATTR_SHADOW_IP_ADDR;
	tmp += "=\"";
	tmp += daemonCore->InfoCommandSinfulString();
	tmp += '"';
	ad->Insert( tmp.Value() );

	tmp = ATTR_SHADOW_VERSION;
	tmp += "=\"";
	tmp += CondorVersion();
	tmp += '"';
	ad->Insert( tmp.Value() );

	char* my_uid_domain = param( "UID_DOMAIN" );
	if( my_uid_domain ) {
		tmp = ATTR_UID_DOMAIN;
		tmp += "=\"";
		tmp += my_uid_domain;
		tmp += '"';
		ad->Insert( tmp.Value() );
		free( my_uid_domain );
	}
}
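
A minimal sketch, using only the MyString and ClassAd calls already present in Example #1, of a helper that factors out the repeated build-a-quoted-attribute pattern above. The helper name is illustrative, not part of the original source:

static void
insertStringAttr( ClassAd* ad, const char* name, const char* value )
{
	// Build 'name="value"' and insert it, exactly as publishShadowAttrs()
	// does inline for each attribute.
	MyString tmp;
	tmp = name;
	tmp += "=\"";
	tmp += value;
	tmp += '"';
	ad->Insert( tmp.Value() );
}

With it, each block above collapses to a single call, e.g. insertStringAttr( ad, ATTR_SHADOW_VERSION, CondorVersion() ).

Example #2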
int condor__getVersionString(struct soap *soap,void *,char* &result)
{
	const char *version = CondorVersion();
	result = (char *) soap_malloc(soap, strlen(version) + 1);
	ASSERT(result);
	strcpy(result, version);
	return SOAP_OK;
}
Example #3
bool
JICLocalSchedd::getLocalJobAd( void )
{ 
	if( ! JICLocalFile::getLocalJobAd() ) {
		return false;
	}
	job_updater = new QmgrJobUpdater( job_ad, schedd_addr, CondorVersion() );
	return true;
}
Example #4
void
printClassAd( void )
{
	printf( "%s = False\n", ATTR_IS_DAEMON_CORE );
	printf( "%s = True\n", ATTR_HAS_REMOTE_SYSCALLS );
	printf( "%s = True\n", ATTR_HAS_CHECKPOINTING );
	printf( "%s = True\n", ATTR_HAS_OLD_VANILLA );
	printf( "%s = True\n", ATTR_HAS_JOB_AD_FROM_FILE );
	printf( "%s = \"%s\"\n", ATTR_VERSION, CondorVersion() );
}
Example #5
void
printClassAd( void )
{
    printf( "%s = True\n", ATTR_IS_DAEMON_CORE );
    printf( "%s = True\n", ATTR_HAS_FILE_TRANSFER );
    printf( "%s = True\n", ATTR_HAS_PER_FILE_ENCRYPTION );
    printf( "%s = True\n", ATTR_HAS_MPI );
    printf( "%s = True\n", ATTR_HAS_JAVA );
    printf( "%s = True\n", ATTR_HAS_RECONNECT );
    printf( "%s = True\n", ATTR_HAS_JOB_AD_FROM_FILE );
    printf( "%s = True\n", ATTR_HAS_VM );
    printf( "%s = \"%s\"\n", ATTR_VERSION, CondorVersion() );
}
Example #6
// Request the sandbox location using a constraint, for which the schedd
// must iterate over all the job ads.
bool 
DCSchedd::requestSandboxLocation(int direction, MyString &constraint, 
	int protocol, ClassAd *respad, CondorError * errstack)
{
	ClassAd reqad;

	////////////////////////////////////////////////////////////////////////
	// This request specifies a collection of job ads as defined by a
	// constraint. Later, when the transfer actually happens at the
	// transferd, this constraint is converted to actual job ads, and the
	// client has to get them back so it knows what to send.
	////////////////////////////////////////////////////////////////////////

	reqad.Assign(ATTR_TREQ_DIRECTION, direction);
	reqad.Assign(ATTR_TREQ_PEER_VERSION, CondorVersion());
	reqad.Assign(ATTR_TREQ_HAS_CONSTRAINT, true);
	reqad.Assign(ATTR_TREQ_CONSTRAINT, constraint.Value());

	// XXX This should be a realized function to convert between the
	// protocol enum and a string description. That way both functions can
	// use it and it won't seem so bad.
	switch(protocol) {
		case FTP_CFTP:
			reqad.Assign(ATTR_TREQ_FTP, FTP_CFTP);
			break;
		default:
			dprintf(D_ALWAYS, "DCSchedd::requestSandboxLocation(): "
				"Can't make a request for a sandbox with an unknown file "
				"transfer protocol!");
			return false;
			break;
	}

	// now connect to the schedd and get the response.
	return requestSandboxLocation(&reqad, respad, errstack);
}
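
The XXX comment above asks for a shared helper that converts between the file-transfer protocol enum and a string description. A minimal sketch of one direction; only FTP_CFTP appears in this snippet, so the handling of any other enum value is an assumption:

static const char*
ftpProtocolToString( int protocol )
{
	switch( protocol ) {
		case FTP_CFTP:
			return "CFTP";
		default:
			// Report unknown protocols rather than guessing.
			return "UNKNOWN";
	}
}

Both requestSandboxLocation() variants could then log the protocol by name instead of duplicating the switch.

Example #7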
void
XferSummary::time_out(time_t now, char *hostaddr)
{
	ClassAd	   	info;
	char		line[128], *tmp;
	char		*str = NULL;

	info.SetMyTypeName("CkptServer");
	info.SetTargetTypeName("CkptFile");

	sprintf(line, "%s = \"%s\"", ATTR_NAME, my_full_hostname() );
	info.Insert(line);
    sprintf(line, "%s = \"%s\"", ATTR_MACHINE, hostaddr );
	info.Insert(line);
	sprintf(line, "%s = \"%s\"", ATTR_VERSION, CondorVersion() );
	info.Insert(line);
	sprintf(line, "%s = \"%s\"", ATTR_PLATFORM, CondorPlatform() );
	info.Insert(line);
	sprintf(line, "NumSends = %d", num_sends);
	info.Insert(line);
	sprintf(line, "BytesSent = %d", (int) bytes_sent);
	info.Insert(line);
	sprintf(line, "TimeSending = %d", time_sending);
	info.Insert(line);
	sprintf(line, "AvgSendBandwidth = %f", num_sends ?
			tot_send_bandwidth / num_sends : 0.0);
	info.Insert(line);
	sprintf(line, "NumRecvs = %d", num_recvs);
	info.Insert(line);
	sprintf(line, "BytesReceived = %d", (int) bytes_recv);
	info.Insert(line);
	sprintf(line, "TimeReceiving = %d", time_recving);
	info.Insert(line);
	sprintf(line, "AvgReceiveBandwidth = %f", num_recvs ?
			tot_recv_bandwidth / num_recvs : 0.0);
	info.Insert(line);

	/* ctime adds a newline at the end of the ascii conversion.... */
	str = ctime(&start_time);
	sprintf(line, "CkptServerIntervalStart = \"%s\"", str ? str : "Unknown\n");
	tmp = strchr( line, '\n' );
	if (tmp != NULL) {
		/* delete the newline */
		*tmp = '\"';
		tmp++;
		*tmp = '\0';
	}
	info.Insert(line);

	/* ctime adds a newline at the end of the ascii conversion.... */
	str = ctime(&now);
	sprintf(line, "CkptServerIntervalEnd = \"%s\"", str ? str : "Unknown\n");
	tmp = strchr( line, '\n' );
	if (tmp != NULL) {
		/* delete the newline */
		*tmp = '\"';
		tmp++;
		*tmp = '\0';
	}
	info.Insert(line);

	sprintf(line, "Disk = %d", sysapi_disk_space(pwd.Value()) );
	info.Insert(line);
	
	// Send to collector
	if ( Collectors ) {
        dprintf(D_NETWORK, "Sending CkptServer ClassAd:\n");
        info.dPrint(D_NETWORK);
		Collectors->sendUpdates (UPDATE_CKPT_SRVR_AD, &info, NULL, true);
	}

	init();
}
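
A hedged alternative for the ctime() newline handling above, using only standard C library calls: copy the timestamp into a local buffer and strip the trailing '\n' there, instead of patching the already-quoted line in place:

char when[64];
str = ctime(&now);
snprintf(when, sizeof(when), "%s", str ? str : "Unknown");
tmp = strchr(when, '\n');
if (tmp != NULL) {
	/* delete the newline before quoting the timestamp */
	*tmp = '\0';
}
sprintf(line, "CkptServerIntervalEnd = \"%s\"", when);
info.Insert(line);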
Example #8
void
doContactSchedd()
{
	int rc;
	Qmgr_connection *schedd;
	BaseJob *curr_job;
	ClassAd *next_ad;
	char expr_buf[12000];
	bool schedd_updates_complete = false;
	bool schedd_deletes_complete = false;
	bool add_remove_jobs_complete = false;
	bool update_jobs_complete = false;
	bool commit_transaction = true;
	int failure_line_num = 0;
	bool send_reschedule = false;
	std::string error_str = "";
	StringList dirty_job_ids;
	char *job_id_str;
	PROC_ID job_id;
	CondorError errstack;

	dprintf(D_FULLDEBUG,"in doContactSchedd()\n");

	initJobExprs();

	contactScheddTid = TIMER_UNSET;

	// vacateJobs
	/////////////////////////////////////////////////////
	if ( pendingScheddVacates.getNumElements() != 0 ) {
		std::string buff;
		StringList job_ids;
		VacateRequest curr_request;

		int result;
		ClassAd* rval;

		pendingScheddVacates.startIterations();
		while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
			formatstr( buff, "%d.%d", curr_request.job->procID.cluster,
						  curr_request.job->procID.proc );
			job_ids.append( buff.c_str() );
		}

		char *tmp = job_ids.print_to_string();
		if ( tmp ) {
			dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp );
			free(tmp);
			tmp = NULL;
		}

		rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack );
		if ( rval == NULL ) {
			formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!",
							   errstack.getFullText().c_str() );
			goto contact_schedd_failure;
		} else {
			pendingScheddVacates.startIterations();
			while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
				formatstr( buff, "job_%d_%d", curr_request.job->procID.cluster,
							  curr_request.job->procID.proc );
				if ( !rval->LookupInteger( buff.c_str(), result ) ) {
					dprintf( D_FULLDEBUG, "vacateJobs returned malformed ad\n" );
					EXCEPT( "vacateJobs returned malformed ad" );
				} else {
					dprintf( D_FULLDEBUG, "   %d.%d vacate result: %d\n",
							 curr_request.job->procID.cluster,
							 curr_request.job->procID.proc,result);
					pendingScheddVacates.remove( curr_request.job->procID );
					curr_request.result = (action_result_t)result;
					curr_request.job->SetEvaluateState();
					completedScheddVacates.insert( curr_request.job->procID,
												   curr_request );
				}
			}
			delete rval;
		}
	}


	schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName, CondorVersion() );
	if ( !schedd ) {
		error_str = "Failed to connect to schedd!";
		goto contact_schedd_failure;
	}


	// CheckLeases
	/////////////////////////////////////////////////////
	if ( checkLeasesSignaled ) {

		dprintf( D_FULLDEBUG, "querying for renewed leases\n" );

		// Grab the lease attributes of all the jobs in our global hashtable.

		BaseJob::JobsByProcId.startIterations();

		while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) {
			int new_expiration;

			rc = GetAttributeInt( curr_job->procID.cluster,
								  curr_job->procID.proc,
								  ATTR_TIMER_REMOVE_CHECK,
								  &new_expiration );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// This job doesn't have a lease from
						// the submitter. Skip it.
					continue;
				}
			}
			curr_job->UpdateJobLeaseReceived( new_expiration );
		}

		checkLeasesSignaled = false;
	}	// end of handling check leases


	// AddJobs
	/////////////////////////////////////////////////////
	if ( addJobsSignaled || firstScheddContact ) {
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for new jobs\n" );

		// Make sure we grab all Globus Universe jobs (except held ones
		// that we previously indicated we were done with)
		// when we first start up in case we're recovering from a
		// shutdown/meltdown.
		// Otherwise, grab all jobs that are unheld and aren't marked as
		// currently being managed and aren't marked as not matched.
		// If JobManaged is undefined, equate it with false.
		// If Matched is undefined, equate it with true.
		// NOTE: Schedds from Condor 6.6 and earlier don't include
		//   "(Universe==9)" in the constraint they give to the gridmanager,
		//   so this gridmanager will pull down non-globus-universe ads,
		//   although it won't use them. This is inefficient but not
		//   incorrect behavior.
		if ( firstScheddContact ) {
			// Grab all jobs for us to manage. This expression is a
			// derivative of the expression below for new jobs. We add
			// "|| Managed =?= TRUE" to also get jobs our previous
			// incarnation was in the middle of managing when it died
			// (if it died unexpectedly). With the new term, the
			// "&& Managed =!= TRUE" from the new jobs expression becomes
			// superfluous (by boolean logic), so we drop it.
			sprintf( expr_buf,
					 "%s && %s && ((%s && %s) || %s)",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_managed.c_str()
					 );
		} else {
			// Grab new jobs for us to manage
			sprintf( expr_buf,
					 "%s && %s && %s && %s && %s",
					 expr_schedd_job_constraint.c_str(), 
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_not_managed.c_str()
					 );
		}
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *old_job;
			int job_is_matched = 1;		// default to true if not in ClassAd

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			bool job_is_managed = jobExternallyManaged(next_ad);
			next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched);

			if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) {

				JobType *job_type = NULL;
				BaseJob *new_job = NULL;

				// job had better be either managed or matched! (or both)
				ASSERT( job_is_managed || job_is_matched );

				if ( MustExpandJobAd( next_ad ) ) {
					// Get the expanded ClassAd from the schedd, which
					// has the GridResource filled in with info from
					// the matched ad.
					delete next_ad;
					next_ad = NULL;
					next_ad = GetJobAd(procID.cluster,procID.proc);
					if ( next_ad == NULL && errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
					if ( next_ad == NULL ) {
						// We may get here if it was not possible to expand
						// one of the $$() expressions.  We don't want to
						// roll back the transaction and blow away the
						// hold that the schedd just put on the job, so
						// simply skip over this ad.
						dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d.  errno=%d\n",procID.cluster,procID.proc,errno);
						goto contact_schedd_next_add_job;
					}
				}

				// Search our job types for one that'll handle this job
				jobTypes.Rewind();
				while ( jobTypes.Next( job_type ) ) {
					if ( job_type->AdMatchFunc( next_ad ) ) {

						// Found one!
						dprintf( D_FULLDEBUG, "Using job type %s for job %d.%d\n",
								 job_type->Name, procID.cluster, procID.proc );
						break;
					}
				}

				if ( job_type != NULL ) {
					new_job = job_type->CreateFunc( next_ad );
				} else {
					dprintf( D_ALWAYS, "No handlers for job %d.%d\n",
							 procID.cluster, procID.proc );
					new_job = new BaseJob( next_ad );
				}

				ASSERT(new_job);
				new_job->SetEvaluateState();
				dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n",
						new_job->procID.cluster,new_job->procID.proc);
				num_ads++;

				if ( !job_is_managed ) {
					rc = tSetAttributeString( new_job->procID.cluster,
									   new_job->procID.proc,
									   ATTR_JOB_MANAGED,
									   MANAGED_EXTERNAL);
					if ( rc < 0 ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
				}

			} else {

				// We already know about this job, skip
				// But also set Managed=true on the schedd so that it won't
				// keep signalling us about it
				delete next_ad;
				rc = tSetAttributeString( procID.cluster, procID.proc,
								   ATTR_JOB_MANAGED, MANAGED_EXTERNAL );
				if ( rc < 0 ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				}

			}

contact_schedd_next_add_job:
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}	// end of while next_ad
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads);
	}	// end of handling add jobs


	// RemoveJobs
	/////////////////////////////////////////////////////

	// We always want to perform this check. Otherwise, we may overwrite a
	// REMOVED/HELD/COMPLETED status with something else below.
	{
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" );

		// Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we
		// haven't previously indicated that we're done with (by setting
		// JobManaged to "Schedd".
		sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))",
				 ScheddJobConstraint, expr_not_completely_done.c_str(),
				 ATTR_JOB_STATUS, REMOVED,
				 ATTR_JOB_STATUS, COMPLETED, ATTR_JOB_STATUS, HELD,
				 ATTR_JOB_MANAGED, MANAGED_EXTERNAL );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *next_job;
			int curr_status;

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status );

			if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) {
				// Should probably skip jobs we already have marked as
				// held or removed

				next_job->JobAdUpdateFromSchedd( next_ad, true );
				num_ads++;

			} else if ( curr_status == REMOVED ) {

				// If we don't know about the job, act like we got an
				// ADD_JOBS signal from the schedd the next time we
				// connect, so that we'll create a Job object for it
				// and decide how it needs to be handled.
				// TODO The AddJobs and RemoveJobs queries should be
				//   combined into a single query.
				dprintf( D_ALWAYS, 
						 "Don't know about removed job %d.%d. "
						 "Will treat it as a new job to manage\n",
						 procID.cluster, procID.proc );
				addJobsSignaled = true;

			} else {

				dprintf( D_ALWAYS, "Don't know about held/completed job %d.%d. "
						 "Ignoring it\n",
						 procID.cluster, procID.proc );

			}

			delete next_ad;
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads);
	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	add_remove_jobs_complete = true;


	// Retrieve dirty attributes
	/////////////////////////////////////////////////////
	if ( updateJobsSignaled ) {
		dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" );

		sprintf( expr_buf, "%s && %s && %s && %s",
				 expr_schedd_job_constraint.c_str(), 
				 expr_not_completely_done.c_str(),
				 expr_not_held.c_str(),
				 expr_managed.c_str()
				 );
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			ClassAd updates;
			char str[PROC_ID_STR_BUFLEN];
			next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc );
			if ( GetDirtyAttributes( job_id.cluster, job_id.proc, &updates ) < 0 ) {
				dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc );
				failure_line_num = __LINE__;
				delete next_ad;
				goto contact_schedd_disconnect;
			} else {
				dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc);
				dPrintAd(D_JOB, updates);
			}
			if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
				curr_job->JobAdUpdateFromSchedd( &updates, false );
				ProcIdToStr( job_id, str );
				dirty_job_ids.append( str );
			}
			else {
				dprintf( D_ALWAYS, "Don't know about updated job %d.%d. "
						 "Ignoring it\n",
						 job_id.cluster, job_id.proc );
			}
			delete next_ad;
			next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 );
		}
	}
	update_jobs_complete = true;

//	if ( BeginTransaction() < 0 ) {
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}


	// requestJobStatus
	/////////////////////////////////////////////////////
	if ( pendingJobStatus.getNumElements() != 0 ) {
		JobStatusRequest curr_request;

		pendingJobStatus.startIterations();
		while ( pendingJobStatus.iterate( curr_request ) != 0 ) {

			int status;

			rc = GetAttributeInt( curr_request.job_id.cluster,
								  curr_request.job_id.proc,
								  ATTR_JOB_STATUS, &status );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so return a job status of REMOVED.
					status = REMOVED;
				}
			}
				// return status
			dprintf( D_FULLDEBUG, "%d.%d job status: %d\n",
					 curr_request.job_id.cluster,
					 curr_request.job_id.proc, status );
			pendingJobStatus.remove( curr_request.job_id );
			curr_request.job_status = status;
			daemonCore->Reset_Timer( curr_request.tid, 0 );
			completedJobStatus.insert( curr_request.job_id,
									   curr_request );
		}

	}


	// Update existing jobs
	/////////////////////////////////////////////////////
	ScheddUpdateRequest *curr_request;
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n",
				curr_job->procID.cluster, curr_job->procID.proc);
		const char *attr_name;
		const char *attr_value;
		ExprTree *expr;
		bool fake_job_in_queue = false;
		curr_job->jobAd->ResetExpr();
		while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true &&
				fake_job_in_queue == false ) {
			attr_value = ExprTreeToString( expr );

			dprintf(D_FULLDEBUG,"   %s = %s\n",attr_name,attr_value);
			rc = SetAttribute( curr_job->procID.cluster,
							   curr_job->procID.proc,
							   attr_name,
							   attr_value);
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
						// The job is not in the schedd's job queue. This
						// probably means that the user did a condor_rm -f,
						// so pretend that all updates for the job succeed.
						// Otherwise, we'll never make forward progress on
						// the job.
						// TODO We should also fake a job status of REMOVED
						//   to the job, so it can do what cleanup it can.
					fake_job_in_queue = true;
					break;
				}
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_updates_complete = true;


	// Delete existing jobs
	/////////////////////////////////////////////////////
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		if ( curr_job->deleteFromSchedd ) {
			dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n",
					curr_job->procID.cluster, curr_job->procID.proc);
			rc = DestroyProc(curr_job->procID.cluster,
							 curr_job->procID.proc);
				// NOENT means the job doesn't exist.  Good enough for us.
			if ( rc < 0 && rc != DESTROYPROC_ENOENT) {
				failure_line_num = __LINE__;
				commit_transaction = false;
				goto contact_schedd_disconnect;
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_deletes_complete = true;


 contact_schedd_disconnect:
	DisconnectQ( schedd, commit_transaction );

	if ( add_remove_jobs_complete == true ) {
		firstScheddContact = false;
		addJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( update_jobs_complete == true ) {
		updateJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( schedd_updates_complete == false ) {
		formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	// Clear dirty bits for all jobs updated
	if ( !dirty_job_ids.isEmpty() ) {
		ClassAd *rval;
		dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n",
				 dirty_job_ids.number() );
		dirty_job_ids.rewind();
		rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack );
		if ( rval == NULL ) {
			dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes.  CondorError: %s\n", errstack.getFullText().c_str() );
		}
		delete rval;
	}

	// Wake up jobs that had schedd updates pending and delete job
	// objects that wanted to be deleted
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		curr_job->jobAd->ClearAllDirtyFlags();

		if ( curr_job->deleteFromGridmanager ) {

				// If the Job object wants to delete the job from the
				// schedd but we failed to do so, don't delete the job
				// object yet; wait until we successfully delete the job
				// from the schedd.
			if ( curr_job->deleteFromSchedd == true &&
				 schedd_deletes_complete == false ) {
				continue;
			}

				// If wantRematch is set, send a reschedule now
			if ( curr_job->wantRematch ) {
				send_reschedule = true;
			}
			pendingScheddUpdates.remove( curr_job->procID );
			pendingScheddVacates.remove( curr_job->procID );
			pendingJobStatus.remove( curr_job->procID );
			completedJobStatus.remove( curr_job->procID );
			completedScheddVacates.remove( curr_job->procID );
			delete curr_job;

		} else {
			pendingScheddUpdates.remove( curr_job->procID );

			if ( curr_request->m_notify ) {
				curr_job->SetEvaluateState();
			}
		}

		delete curr_request;
	}

	// Poke objects that wanted to be notified when a schedd update completed
	// successfully (possibly minus deletes)
	int timer_id;
	scheddUpdateNotifications.Rewind();
	while ( scheddUpdateNotifications.Next( timer_id ) ) {
		daemonCore->Reset_Timer( timer_id, 0 );
	}
	scheddUpdateNotifications.Clear();

	if ( send_reschedule == true ) {
		ScheddObj->reschedule();
	}

	// Check if we have any jobs left to manage. If not, exit.
	if ( BaseJob::JobsByProcId.getNumElements() == 0 ) {
		dprintf( D_ALWAYS, "No jobs left, shutting down\n" );
		daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM );
	}

	lastContactSchedd = time(NULL);

	if ( schedd_deletes_complete == false ) {
		error_str = "Problem using DestroyProc to delete jobs!";
		goto contact_schedd_failure;
	}

	scheddFailureCount = 0;

	// For each job that had dirty attributes, re-evaluate the policy
	dirty_job_ids.rewind();
	while ( (job_id_str = dirty_job_ids.next()) != NULL ) {
		StrToProcIdFixMe(job_id_str, job_id);
		if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
			curr_job->EvalPeriodicJobExpr();
		}
	}

dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n");
	return;

 contact_schedd_failure:
	scheddFailureCount++;
	if ( error_str == "" ) {
		error_str = "Failure in doContactSchedd";
	}
	if ( scheddFailureCount >= maxScheddFailures ) {
		dprintf( D_ALWAYS, "%s\n", error_str.c_str() );
		EXCEPT( "Too many failures connecting to schedd!" );
	}
	dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() );
	lastContactSchedd = time(NULL);
	RequestContactSchedd();
	return;
}
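
doContactSchedd() uses the same qmgmt scan shape three times (GetNextJobByConstraint twice, GetNextDirtyJobByConstraint once). Condensed, with the start/continue flag as in the snippet and error handling trimmed:

ClassAd *ad = GetNextJobByConstraint( expr_buf, 1 );	// 1 = start a new scan
while ( ad != NULL ) {
	// ... examine or consume the ad ...
	delete ad;
	ad = GetNextJobByConstraint( expr_buf, 0 );		// 0 = continue the scan
}
if ( errno == ETIMEDOUT ) {
	// schedd connection timed out; roll back the transaction, as above
}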
Example #9
/*ARGSUSED*/
int
main(int argc, char *argv[] )
{
	char	*tmp = NULL;
	int		reserved_swap, free_swap;
	char	*host = NULL, *cluster = NULL, *proc = NULL;
	char	*bogus_capability;
	int		i;

	set_mySubSystem( "SHADOW", SUBSYSTEM_TYPE_SHADOW );

	myDistro->Init( argc, argv );
	if( argc == 2 && strncasecmp(argv[1], "-cl", 3) == MATCH ) {
		printClassAd();
		exit( 0 );
	}

#if defined(SYSCALL_DEBUG)
	SyscallLabel = argv[0] + 7;
#endif

#if !defined(WIN32)
	install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN );
#endif

	if( argc > 1 ) {
		if( strcmp("-t",argv[1]) == MATCH ) {
			Termlog = 1;
			argv++;
			argc--;
		}
	}

	ShadowBDate = LastRestartTime = time(0);

	_EXCEPT_Cleanup = ExceptCleanup;

	MyPid = getpid();
	
	config();

	/* Start up with condor.condor privileges. */
	/*
	  we need to do this AFTER we call config() so that if CONDOR_IDS
	  is being defined in the config file, we'll get the right value
	*/ 
	set_condor_priv();

	if(Termlog)
		dprintf_set_tool_debug(get_mySubSystem()->getName(), 0);
	else
		dprintf_config( get_mySubSystem()->getName() );
	DebugId = whoami;

	// create a database connection object
	
	dprintf( D_ALWAYS, "******* Standard Shadow starting up *******\n" );
	dprintf( D_ALWAYS, "** %s\n", CondorVersion() );
	dprintf( D_ALWAYS, "** %s\n", CondorPlatform() );
	dprintf( D_ALWAYS, "*******************************************\n" );

	reserved_swap = param_integer("RESERVED_SWAP", 0);
	reserved_swap *= 1024; /* megabytes -> kb */

	bool use_sql_log = param_boolean("QUILL_USE_SQL_LOG", false);
    FILEObj = FILESQL::createInstance(use_sql_log);
	
	free_swap = sysapi_swap_space();

	dprintf( D_FULLDEBUG, "*** Reserved Swap = %d\n", reserved_swap );
	dprintf( D_FULLDEBUG, "*** Free Swap = %d\n", free_swap );
	if( reserved_swap && free_swap < reserved_swap ) {
		dprintf( D_ALWAYS, "Not enough reserved swap space\n" );
		if(FILEObj) {
		  delete FILEObj;
		}

		exit( JOB_NO_MEM );
	}

	dprintf(D_ALWAYS, "uid=%d, euid=%d, gid=%d, egid=%d\n",
		getuid(), geteuid(), getgid(), getegid());

    dprintf(D_FULLDEBUG, "argc = %d\n", argc);
    for(i = 0; i < argc; i++)
    {
        dprintf(D_FULLDEBUG, "argv[%d] = %s\n", i, argv[i]);
    }
	if( argc < 6 ) {
		usage();
	}

	if (param_boolean("SHADOW_DEBUG_WAIT", false, false)) {
		int debug_wait = 1;
		dprintf(D_ALWAYS,
				"SHADOW_DEBUG_WAIT is TRUE, waiting for debugger to attach to pid %d.\n", 
				(int)::getpid());
		while (debug_wait) {
			sleep(1);
		}
	}

	CheckSpoolVersion(SPOOL_MIN_VERSION_SHADOW_SUPPORTS,SPOOL_CUR_VERSION_SHADOW_SUPPORTS);

	if( strcmp("-pipe",argv[1]) == 0 ) {
		bogus_capability = argv[2];
		cluster = argv[3];
		proc = argv[4];
		// read the big comment in the function for why this is here.
		RemoveNewShadowDroppings(cluster, proc);
		pipe_setup( cluster, proc, bogus_capability );
	} else {
		schedd = argv[1];
		host = argv[2];
		bogus_capability = argv[3];
		cluster = argv[4];
		proc = argv[5];
		if ( argc > 6 ) {
			IpcFile = argv[6];
			dprintf(D_FULLDEBUG,"Setting IpcFile to %s\n",IpcFile);
		} else {
			IpcFile = NULL;
		}
		// read the big comment in the function for why this is here.
		RemoveNewShadowDroppings(cluster, proc);
		regular_setup( host, cluster, proc );
	}
	scheddName = getenv( EnvGetName( ENV_SCHEDD_NAME ) );

#if 0
		/* Don't want to share log file lock between child and parent */
	(void)close( LockFd );
	LockFd = -1;
#endif

	// initialize the user log
	initializeUserLog();

	My_Filesystem_Domain = param( "FILESYSTEM_DOMAIN" ); 
	dprintf( D_ALWAYS, "My_Filesystem_Domain = \"%s\"\n", 
			 My_Filesystem_Domain );

	My_UID_Domain = param( "UID_DOMAIN" ); 
	dprintf( D_ALWAYS, "My_UID_Domain = \"%s\"\n", My_UID_Domain );

	UseAFS = param_boolean_crufty( "USE_AFS", false ) ? TRUE : FALSE;

	UseNFS = param_boolean_crufty( "USE_NFS", false ) ? TRUE : FALSE;

	// if job specifies a checkpoint server host, this overrides
	// the config file parameters
	tmp = NULL;
	if (JobAd->LookupString(ATTR_CKPT_SERVER, &tmp) == 1) {
		if (CkptServerHost) free(CkptServerHost);
		UseCkptServer = TRUE;
		CkptServerHost = strdup(tmp);
		StarterChoosesCkptServer = FALSE;
		free(tmp);
	} else {
		free(tmp);
		if (CkptServerHost) {
            free(CkptServerHost);
        }
		CkptServerHost = param( "CKPT_SERVER_HOST" );
		UseCkptServer = FALSE;
		if( CkptServerHost && param_boolean_crufty( "USE_CKPT_SERVER", true ) ) {
			UseCkptServer = TRUE;
		}

		StarterChoosesCkptServer =
			param_boolean_crufty("STARTER_CHOOSES_CKPT_SERVER", true) ? TRUE : FALSE;
	}

		// Initialize location of our checkpoint file.  If we stored it
		// on a checkpoint server then set LastCkptServer.  Otherwise,
		// LastCkptServer should be NULL to indicate that we should
		// look on the local disk for our checkpoint file.
	LastCkptServer = NULL;
	if (JobAd->LookupString(ATTR_LAST_CKPT_SERVER,
							&LastCkptServer) == 0) {
		free(LastCkptServer);
		LastCkptServer = NULL;
	}

	// LIGO
	if (param_boolean("ALWAYS_USE_LOCAL_CKPT_SERVER", false)) {
		if (LastCkptServer) {
			char *remoteHost = NULL;
			JobAd->LookupString(ATTR_REMOTE_HOST, &remoteHost);

			char *machineName = strrchr(remoteHost, '@');
			if (machineName == NULL) {
				machineName = remoteHost;
			} else {
				machineName++;
			}

			LastCkptServer = strdup(machineName);
			CkptServerHost = strdup(machineName);

			dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, forcing LastCkptServer to %s\n", LastCkptServer);
		} else {
			dprintf(D_ALWAYS, "ALWAYS_USE_LOCAL_CKPT_SERVER is true, but checkpoint is not on server, restoring file local file\n");
		}
	}

	MaxDiscardedRunTime = param_integer( "MAX_DISCARDED_RUN_TIME", 3600 );

	CompressPeriodicCkpt = param_boolean( "COMPRESS_PERIODIC_CKPT", false );

	PeriodicSync = param_boolean( "PERIODIC_MEMORY_SYNC", false );

	CompressVacateCkpt = param_boolean( "COMPRESS_VACATE_CKPT", false );

	SlowCkptSpeed = param_integer( "SLOW_CKPT_SPEED", 0 );

	// Install signal handlers such that all signals are blocked when inside
	// the handler.
	sigset_t fullset;
	sigfillset(&fullset);
	install_sig_handler_with_mask( SIGCHLD,&fullset, reaper );

		// SIGUSR1 is sent by the schedd when a job is removed with
		// condor_rm.
	install_sig_handler_with_mask( SIGUSR1, &fullset, handle_sigusr1 );

		// SIGQUIT is sent for a fast shutdown.
	install_sig_handler_with_mask( SIGQUIT, &fullset, handle_sigquit );

		// SIGTERM is sent for a graceful shutdown.
	install_sig_handler_with_mask( SIGTERM, &fullset, handle_sigterm );


	/* Here we block the async signals.  We do this mainly because on HPUX,
	 * XDR wigs out if it is interrupted by a signal.  We do it on all
	 * platforms because it seems like a safe idea.  We unblock the signals
	 * before select(), which is where we spend most of our time. */
	block_signal(SIGCHLD);
	block_signal(SIGUSR1);      

	/* If the completed job had been committed to the job queue,
		but for some reason the shadow exited weirdly and the
		schedd is trying to run it again, then simply write
		the job termination events and send the email as if the job had
		just ended. */
	if (terminate_is_pending() == TRUE) {
		/* This function will exit()! */
		handle_terminate_pending();
	}

	HandleSyscalls();

	Wrapup();

	/* HACK! WHOOO!!!!! Throw the old shadow away already! */
	/* This will figure out whether or not the job should go on hold AFTER the
		job has exited for whatever reason, or if the job should be allowed
		to exit. It modifies ExitReason appropriately for job holding, or, get
		this, just EXCEPTs if the job is supposed to go into idle state and
		not leave. :) */
	/* Small change by Todd : only check the static policy if the job really
	   exited of its own accord -- we don't want to even evaluate the static
	   policy if the job exited because it was preempted, for instance */
	if (check_static_policy && 
		(ExitReason == JOB_EXITED || ExitReason == JOB_KILLED 
		     	|| ExitReason == JOB_COREDUMPED)) 
	{
		static_policy();
	}
	if( My_UID_Domain ) {
		free( My_UID_Domain );
	}
	if( My_Filesystem_Domain ) {
		free( My_Filesystem_Domain );
	}
	if(FILEObj) {
		delete FILEObj;
	}

	dprintf( D_ALWAYS, "********** Shadow Exiting(%d) **********\n",
		ExitReason );
	exit( ExitReason );
}
Example #10
static void version()
{
	printf( "%s\n%s\n", CondorVersion(), CondorPlatform() );
}
Example #11
bool
DCStarter::peek(bool transfer_stdout, ssize_t &stdout_offset, bool transfer_stderr, ssize_t &stderr_offset, const std::vector<std::string> &filenames, std::vector<ssize_t> &offsets, size_t max_bytes, bool &retry_sensible, PeekGetFD &next, std::string &error_msg, unsigned timeout, const std::string &sec_session_id, DCTransferQueue *xfer_q)
{
	compat_classad::ClassAd ad;
	ad.InsertAttr(ATTR_JOB_OUTPUT, transfer_stdout);
	ad.InsertAttr("OutOffset", stdout_offset);
	ad.InsertAttr(ATTR_JOB_ERROR, transfer_stderr);
	ad.InsertAttr("ErrOffset", stderr_offset);
	ad.InsertAttr(ATTR_VERSION, CondorVersion());

	size_t total_files = 0;
	total_files += transfer_stdout ? 1 : 0;
	total_files += transfer_stderr ? 1 : 0;

	if (filenames.size())
	{
		total_files += filenames.size();
		std::vector<classad::ExprTree *> filelist; filelist.reserve(filenames.size());
		std::vector<classad::ExprTree *> offsetlist; offsetlist.reserve(filenames.size());
		std::vector<ssize_t>::const_iterator it2 = offsets.begin();
		for (std::vector<std::string>::const_iterator it = filenames.begin();
			it != filenames.end() && it2 != offsets.end();
			it++, it2++)
		{
			classad::Value value;
			value.SetStringValue(*it);
			filelist.push_back(classad::Literal::MakeLiteral(value));
			value.SetIntegerValue(*it2);
			offsetlist.push_back(classad::Literal::MakeLiteral(value));
		}
		classad::ExprTree *list(classad::ExprList::MakeExprList(filelist));
		ad.Insert("TransferFiles", list);
		list = classad::ExprList::MakeExprList(offsetlist);
		ad.Insert("TransferOffsets", list);
	}

	ad.InsertAttr(ATTR_MAX_TRANSFER_BYTES, static_cast<long long>(max_bytes));

	ReliSock sock;

	if( !connectSock(&sock, timeout, NULL) ) {
		error_msg = "Failed to connect to starter";
		return false;
	}

	if( !startCommand(STARTER_PEEK, &sock, timeout, NULL, NULL, false, sec_session_id.c_str()) ) {
		error_msg = "Failed to send START_PEEK to starter";
		return false;
	}
	sock.encode();
	if (!putClassAd(&sock, ad) || !sock.end_of_message()) {
		error_msg = "Failed to send request to starter";
		return false;
	}

	compat_classad::ClassAd response;
	sock.decode();
	if (!getClassAd(&sock, response) || !sock.end_of_message())
	{
		error_msg = "Failed to read response for peeking at logs.";
		return false;
	}
	dPrintAd(D_FULLDEBUG, response);

	bool success = false;
	if (!response.EvaluateAttrBool(ATTR_RESULT, success) || !success)
	{
		response.EvaluateAttrBool(ATTR_RETRY, retry_sensible);
		error_msg = "Remote operation failed.";
		response.EvaluateAttrString(ATTR_ERROR_STRING, error_msg);
		return false;
	}
	classad::Value valueX;
	classad_shared_ptr<classad::ExprList> list;
	if (!response.EvaluateAttr("TransferFiles", valueX) || !valueX.IsSListValue(list))
	{
		error_msg = "Unable to evaluate starter response";
		return false;
	}
	classad_shared_ptr<classad::ExprList> offlist;
	if (!response.EvaluateAttr("TransferOffsets", valueX) || !valueX.IsSListValue(offlist))
	{
		error_msg = "Unable to evaluate starter response (missing offsets)";
		return false;
	}

	size_t remaining = max_bytes;
	size_t file_count = 0;
	classad::ExprList::const_iterator it2 = offlist->begin();
	for (classad::ExprList::const_iterator it = list->begin();
		it != list->end() && it2 != offlist->end();
		it++, it2++)
	{
		classad::Value value;
		(*it2)->Evaluate(value);
		off_t off = -1;
		value.IsIntegerValue(off);
		(*it)->Evaluate(value);
		std::string filename;
		int64_t xfer_fd = -1;
		if (!value.IsStringValue(filename) && value.IsIntegerValue(xfer_fd))
		{
			if (xfer_fd == 0) filename = "_condor_stdout";
			if (xfer_fd == 1) filename = "_condor_stderr";
		}
		int fd = next.getNextFD(filename);
		filesize_t size = -1;
		int retval;
		if ((retval = sock.get_file(&size, fd, false, false, remaining, xfer_q)) && (retval != GET_FILE_MAX_BYTES_EXCEEDED))
		{
			error_msg = "Internal error when transferring file " + filename;
		}
		else if (size >= 0)
		{
			remaining -= size;	// decrement by the bytes actually transferred
			file_count++;
			off += size;
		}
		else
		{
			error_msg = "Failed to transfer file " + filename;
		}
		if (xfer_fd == 0)
		{
			stdout_offset = off;
			//dprintf(D_FULLDEBUG, "New stdout offset: %ld\n", stdout_offset);
		}
		else if (xfer_fd == 1)
		{
			stderr_offset = off;
		}
		else
		{
			std::vector<ssize_t>::iterator it4 = offsets.begin();
			for (std::vector<std::string>::const_iterator it3 = filenames.begin();
				it3 != filenames.end() && it4 != offsets.end();
				it3++, it4++)
			{
				if (*it3 == filename) *it4 = off;
			}
		}
	}
	size_t remote_file_count;
	if (!sock.get(remote_file_count) || !sock.end_of_message())
	{
		error_msg = "Unable to get remote file count.";
		return false;
	}
	if (file_count != remote_file_count)
	{
		std::stringstream ss;
		ss << "Recieved " << file_count << " files, but remote side thought it sent " << remote_file_count << " files";
		error_msg = ss.str();
		return false;
	}
	if ((total_files != file_count) && !error_msg.size())
	{
		error_msg = "At least one file transfer failed.";
		return false;
	}
	return true;
}
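
A hypothetical caller sketch for peek(). The PeekGetFD shape is inferred from the single next.getNextFD(filename) call above, and the DCStarter construction and the starter_addr/sec_session variables are assumptions, so treat this purely as an illustration of the argument order:

// Assumed: PeekGetFD exposes the one hook used above, returning an fd
// that the transferred bytes are written into (1 = our stdout).
struct PeekToStdout : public PeekGetFD {
	virtual int getNextFD( const std::string & /*filename*/ ) { return 1; }
};

ssize_t out_off = -1, err_off = -1;
std::vector<std::string> files;		// no extra files beyond stdout
std::vector<ssize_t> offsets;
bool retry = false;
std::string err;
std::string sec_session;			// assumed: empty security session id
PeekToStdout sink;
DCStarter starter( starter_addr );	// starter_addr: assumed sinful string
if ( !starter.peek( true, out_off, false, err_off, files, offsets,
					1024, retry, sink, err, 20, sec_session, NULL ) ) {
	dprintf( D_ALWAYS, "peek failed: %s\n", err.c_str() );
}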
Example #12
bool 
DCSchedd::receiveJobSandbox(const char* constraint, CondorError * errstack, int * numdone /*=0*/)
{
	if(numdone) { *numdone = 0; }
	ExprTree *tree = NULL;
	const char *lhstr;
	int reply;
	int i;
	ReliSock rsock;
	int JobAdsArrayLen;
	bool use_new_command = true;

	if ( version() ) {
		CondorVersionInfo vi( version() );
		if ( vi.built_since_version(6,7,7) ) {
			use_new_command = true;
		} else {
			use_new_command = false;
		}
	}

		// // // // // // // //
		// On the wire protocol
		// // // // // // // //

	rsock.timeout(20);   // years of research... :)
	if( ! rsock.connect(_addr) ) {
		dprintf( D_ALWAYS, "DCSchedd::receiveJobSandbox: "
				 "Failed to connect to schedd (%s)\n", _addr );
		return false;
	}
	if ( use_new_command ) {
		if( ! startCommand(TRANSFER_DATA_WITH_PERMS, (Sock*)&rsock, 0,
						   errstack) ) {
			dprintf( D_ALWAYS, "DCSchedd::receiveJobSandbox: "
					 "Failed to send command (TRANSFER_DATA_WITH_PERMS) "
					 "to the schedd\n" );
			return false;
		}
	} else {
		if( ! startCommand(TRANSFER_DATA, (Sock*)&rsock, 0, errstack) ) {
			dprintf( D_ALWAYS, "DCSchedd::receiveJobSandbox: "
					 "Failed to send command (TRANSFER_DATA) "
					 "to the schedd\n" );
			return false;
		}
	}

		// First, if we're not already authenticated, force that now. 
	if (!forceAuthentication( &rsock, errstack )) {
		dprintf( D_ALWAYS, 
			"DCSchedd::receiveJobSandbox: authentication failure: %s\n",
			errstack ? errstack->getFullText().c_str() : "" );
		return false;
	}

	rsock.encode();

		// Send our version if using the new command
	if ( use_new_command ) {
			// Need to use a named variable, else the wrong version of	
			// code() is called.
		char *my_version = strdup( CondorVersion() );
		if ( !rsock.code(my_version) ) {
			dprintf(D_ALWAYS,"DCSchedd:receiveJobSandbox: "
					"Can't send version string to the schedd\n");
			free( my_version );
			return false;
		}
		free( my_version );
	}

		// Send the constraint
	char * nc_constraint = strdup( constraint );	// de-const
	if ( !rsock.code(nc_constraint) ) {
		free( nc_constraint );
		dprintf(D_ALWAYS,"DCSchedd:receiveJobSandbox: "
				"Can't send JobAdsArrayLen to the schedd\n");
		return false;
	}
	free( nc_constraint );

	if ( !rsock.end_of_message() ) {
		std::string errmsg;
		formatstr(errmsg,
				"Can't send initial message (version + constraint) to schedd (%s)",
				_addr);

		dprintf(D_ALWAYS,"DCSchedd::receiveJobSandbox: %s\n", errmsg.c_str());

		if( errstack ) {
			errstack->push(
				"DCSchedd::receiveJobSandbox",
				CEDAR_ERR_EOM_FAILED,
				errmsg.c_str());
		}
		return false;
	}

		// Now, read how many jobs matched the constraint.
	rsock.decode();
	if ( !rsock.code(JobAdsArrayLen) ) {
		std::string errmsg;
		formatstr(errmsg,
				"Can't receive JobAdsArrayLen from the schedd (%s)",
				_addr);

		dprintf(D_ALWAYS,"DCSchedd::receiveJobSandbox: %s\n", errmsg.c_str());

		if( errstack ) {
			errstack->push(
				"DCSchedd::receiveJobSandbox",
				CEDAR_ERR_GET_FAILED,
				errmsg.c_str());
		}
		return false;
	}

	rsock.end_of_message();

	dprintf(D_FULLDEBUG,"DCSchedd:receiveJobSandbox: "
		"%d jobs matched my constraint (%s)\n",
		JobAdsArrayLen, constraint);

		// Now read all the files via the file transfer object
	for (i=0; i<JobAdsArrayLen; i++) {
		FileTransfer ftrans;
		ClassAd job;

			// grab job ClassAd
		if ( !getClassAd(&rsock, job) ) {
			std::string errmsg;
			formatstr(errmsg, "Can't receive job ad %d from the schedd", i);

			dprintf(D_ALWAYS, "DCSchedd::receiveJobSandbox: %s\n", errmsg.c_str());

			if( errstack ) {
				errstack->push(
							   "DCSchedd::receiveJobSandbox",
							   CEDAR_ERR_GET_FAILED,
							   errmsg.c_str());
			}
			return false;
		}

		rsock.end_of_message();

			// translate the job ad by replacing the 
			// saved SUBMIT_ attributes
		job.ResetExpr();
		while( job.NextExpr(lhstr, tree) ) {
			if ( lhstr && strncasecmp("SUBMIT_",lhstr,7)==0 ) {
					// this attr name starts with SUBMIT_
					// compute new lhs (strip off the SUBMIT_)
				const char *new_attr_name = strchr(lhstr,'_');
				ExprTree * pTree;
				ASSERT(new_attr_name);
				new_attr_name++;
					// insert attribute
				pTree = tree->Copy();
				job.Insert(new_attr_name, pTree, false);
			}
		}	// while next expr

		if ( !ftrans.SimpleInit(&job,false,false,&rsock) ) {
			if( errstack ) {
				int cluster = -1, proc = -1;
				job.LookupInteger(ATTR_CLUSTER_ID,cluster);
				job.LookupInteger(ATTR_PROC_ID,proc);
				errstack->pushf(
					"DCSchedd::receiveJobSandbox",
					FILETRANSFER_INIT_FAILED,
					"File transfer initialization failed for target job %d.%d",
					cluster, proc );
			}
			return false;
		}
		// We want files to be copied to their final places, so apply
		// any filename remaps when downloading.
		if ( !ftrans.InitDownloadFilenameRemaps(&job) ) {
			return false;
		}
		if ( use_new_command ) {
			ftrans.setPeerVersion( version() );
		}
		if ( !ftrans.DownloadFiles() ) {
			if( errstack ) {
				FileTransfer::FileTransferInfo ft_info = ftrans.GetInfo();

				int cluster = -1, proc = -1;
				job.LookupInteger(ATTR_CLUSTER_ID,cluster);
				job.LookupInteger(ATTR_PROC_ID,proc);
				errstack->pushf(
					"DCSchedd::receiveJobSandbox",
					FILETRANSFER_DOWNLOAD_FAILED,
					"File transfer failed for target job %d.%d: %s",
					cluster, proc, ft_info.error_desc.Value() );
			}
			return false;
		}
	}	
		
	rsock.end_of_message();

	rsock.encode();

	reply = OK;
	rsock.code(reply);
	rsock.end_of_message();

	if(numdone) { *numdone = JobAdsArrayLen; }

	return true;
}
Example #13
//---------------------------------------------------------------------------
void main_init (int argc, char ** const argv) {

	printf ("Executing condor dagman ... \n");

		// flag used if DAGMan is invoked with -WaitForDebug so we
		// wait for a developer to attach with a debugger...
	volatile int wait_for_debug = 0;

		// process any config vars -- this happens before we process
		// argv[], since arguments should override config settings
	dagman.Config();

	// The DCpermission (last parm) should probably be PARENT, if it existed
    daemonCore->Register_Signal( SIGUSR1, "SIGUSR1",
                                 (SignalHandler) main_shutdown_remove,
                                 "main_shutdown_remove", NULL);

/****** FOR TESTING *******
    daemonCore->Register_Signal( SIGUSR2, "SIGUSR2",
                                 (SignalHandler) main_testing_stub,
                                 "main_testing_stub", NULL);
****** FOR TESTING ********/
    debug_progname = condor_basename(argv[0]);

		// condor_submit_dag version from .condor.sub
	bool allowVerMismatch = false;
	const char *csdVersion = "undefined";

	int i;
    for (i = 0 ; i < argc ; i++) {
        debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] );
    }

    if (argc < 2) Usage();  //  Make sure an input file was specified

		// get dagman job id from environment, if it's there
		// (otherwise it will be set to "-1.-1.-1")
	dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) );

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
		// Minimum legal version for a .condor.sub file to be compatible
		// with this condor_dagman binary.

		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
		// Be sure to change this if the arguments or environment
		// passed to condor_dagman change in an incompatible way!!
		// !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

	struct DagVersionData {
		int majorVer;
		int minorVer;
		int subMinorVer;
	};
	const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 };

		// Construct a string of the minimum submit file version.
	MyString minSubmitVersionStr;
	minSubmitVersionStr.formatstr( "%d.%d.%d",
				MIN_SUBMIT_FILE_VERSION.majorVer,
				MIN_SUBMIT_FILE_VERSION.minorVer,
				MIN_SUBMIT_FILE_VERSION.subMinorVer );
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    //
    // Process command-line arguments
    //
    for (i = 1; i < argc; i++) {
        if( !strcasecmp( "-Debug", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No debug level specified\n" );
                Usage();
            }
            debug_level = (debug_level_t) atoi (argv[i]);
        } else if( !strcasecmp( "-Lockfile", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" );
                Usage();
            }
            lockFileName = argv[i];
        } else if( !strcasecmp( "-Help", argv[i] ) ) {
            Usage();
        } else if (!strcasecmp( "-Dag", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No DAG specified\n" );
                Usage();
            }
			dagman.dagFiles.append( argv[i] );
        } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxIdle\n" );
                Usage();
            }
            dagman.maxIdle = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxJobs\n" );
                Usage();
            }
            dagman.maxJobs = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) {
			debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with "
						   "-MaxPre and -MaxPost arguments\n" );
			Usage();
        } else if( !strcasecmp( "-MaxPre", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPre\n" );
                Usage();
            }
            dagman.maxPreScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-MaxPost", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT,
							  "Integer missing after -MaxPost\n" );
                Usage();
            }
            dagman.maxPostScripts = atoi( argv[i] );
        } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) {
			debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is "
						"ignored; please use the DAGMAN_ALLOW_EVENTS "
						"config parameter instead\n");
			check_warning_strictness( DAG_STRICT_1 );

        } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) {
			dagman.allowLogError = true;

        } else if( !strcasecmp( "-DontAlwaysRunPost",argv[i] ) ) {
			dagman._runPost = false;

        } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) {
			wait_for_debug = 1;

        } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) {
			dagman.useDagDir = true;

        } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" );
                Usage();
            }
            dagman.autoRescue = (atoi( argv[i] ) != 0);

        } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" );
                Usage();
            }
            dagman.doRescueFrom = atoi (argv[i]);

        } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" );
                Usage();
            }
			csdVersion = argv[i];

        } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) {
			allowVerMismatch = true;

        } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) {
			dagman.dumpRescueDag = true;

        } else if( !strcasecmp( "-verbose", argv[i] ) ) {
			dagman._submitDagDeepOpts.bVerbose = true;

        } else if( !strcasecmp( "-force", argv[i] ) ) {
			dagman._submitDagDeepOpts.bForce = true;
		
        } else if( !strcasecmp( "-notification", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No notification value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strNotification = argv[i];

        } else if( !strcasecmp( "-dagman", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No dagman value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strDagmanPath = argv[i];

        } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" );
                Usage();
            }
			dagman._submitDagDeepOpts.strOutfileDir = argv[i];

        } else if( !strcasecmp( "-update_submit", argv[i] ) ) {
			dagman._submitDagDeepOpts.updateSubmit = true;

        } else if( !strcasecmp( "-import_env", argv[i] ) ) {
			dagman._submitDagDeepOpts.importEnv = true;

        } else if( !strcasecmp( "-priority", argv[i] ) ) {
            i++;
            if( argc <= i || strcmp( argv[i], "" ) == 0 ) {
                debug_printf( DEBUG_NORMAL, "No priority value specified\n");
                Usage();
            }
            dagman._submitDagDeepOpts.priority = atoi(argv[i]);
		} else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) {
			dagman._submitDagDeepOpts.always_use_node_log = false;
        } else {
    		debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n",
						argv[i] );
			Usage();
		}
    }

	dagman.dagFiles.rewind();
	dagman.primaryDagFile = dagman.dagFiles.next();
	dagman.multiDags = (dagman.dagFiles.number() > 1);

	MyString tmpDefaultLog;
	if ( dagman._defaultNodeLog != NULL ) {
		tmpDefaultLog = dagman._defaultNodeLog;
		free( dagman._defaultNodeLog );
	} else {
		tmpDefaultLog = dagman.primaryDagFile + ".nodes.log";
	}

		// Force default log file path to be absolute so it works
		// with -usedagdir and DIR nodes.
	CondorError errstack;
	if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) {
       	debug_printf( DEBUG_QUIET, "Unable to convert default log "
					"file name to absolute path: %s\n",
					errstack.getFullText().c_str() );
		if ( dagman.dag ) {
			// dagman.dag is not created until later in main_init(),
			// so guard against a NULL dereference here.
			dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR );
		}
		DC_Exit( EXIT_ERROR );
	}
	dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() );
	debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n",
				dagman._defaultNodeLog);

    //
    // Check the arguments
    //

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// Checking for version compatibility between the .condor.sub
	// file and this condor_dagman binary...

	// Note: if we're in recovery mode and the submit file version
	// causes us to quit, we leave any existing node jobs still
	// running -- may want to change that eventually.  wenger 2009-10-13.

		// Version of the condor_submit_dag that created our submit file.
	CondorVersionInfo submitFileVersion( csdVersion );

		// Version of this condor_dagman binary.
	CondorVersionInfo dagmanVersion;

		// Just generate this message fragment in one place.
	MyString versionMsg;
	versionMsg.formatstr("the version (%s) of this DAG's Condor submit "
				"file (created by condor_submit_dag)", csdVersion );

		// Make sure version in submit file is valid.
	if( !submitFileVersion.is_valid() ) {
		if ( !allowVerMismatch ) {
        	debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n",
						versionMsg.Value() );
			DC_Exit( EXIT_ERROR );
		} else {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; "
						"continuing because of -AllowVersionMismatch flag\n",
						versionMsg.Value() );
		}

		// Make sure .condor.sub file is recent enough.
	} else if ( submitFileVersion.compare_versions(
				CondorVersion() ) != 0 ) {

		if( !submitFileVersion.built_since_version(
					MIN_SUBMIT_FILE_VERSION.majorVer,
					MIN_SUBMIT_FILE_VERSION.minorVer,
					MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) {
			if ( !allowVerMismatch ) {
        		debug_printf( DEBUG_QUIET, "Error: %s is older than "
							"oldest permissible version (%s)\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
				DC_Exit( EXIT_ERROR );
			} else {
        		debug_printf( DEBUG_NORMAL, "Warning: %s is older than "
							"oldest permissible version (%s); continuing "
							"because of -AllowVersionMismatch flag\n",
							versionMsg.Value(), minSubmitVersionStr.Value() );
			}

			// Warn if .condor.sub file is a newer version than this binary.
		} else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) {
        	debug_printf( DEBUG_NORMAL, "Warning: %s is newer than "
						"condor_dagman version (%s)\n", versionMsg.Value(),
						CondorVersion() );
			check_warning_strictness( DAG_STRICT_3 );
		} else {
        	debug_printf( DEBUG_NORMAL, "Note: %s differs from "
						"condor_dagman version (%s), but the "
						"difference is permissible\n", 
						versionMsg.Value(), CondorVersion() );
		}
	}
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    if( dagman.primaryDagFile == "" ) {
        debug_printf( DEBUG_SILENT, "No DAG file was specified\n" );
        Usage();
    }
    if (lockFileName == NULL) {
        debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" );
        Usage();
    }
    if( dagman.maxJobs < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n");
        Usage();
    }
    if( dagman.maxPreScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" );
        Usage();
    }
    if( dagman.maxPostScripts < 0 ) {
        debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" );
        Usage();
    }
    if( dagman.doRescueFrom < 0 ) {
        debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" );
        Usage();
    }

    debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n",
                   lockFileName );
	if ( dagman.dagFiles.number() == 1 ) {
    	debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n",
				  	dagman.primaryDagFile.Value() );
	} else {
		MyString msg = "DAG Input files are ";
		dagman.dagFiles.rewind();
		const char *dagFile;
		while ( (dagFile = dagman.dagFiles.next()) != NULL ) {
			msg += dagFile;
			msg += " ";
		}
		msg += "\n";
    	debug_printf( DEBUG_VERBOSE, "%s", msg.Value() );
	}

		// if requested, wait for someone to attach with a debugger...
	while( wait_for_debug ) { }

    {
		MyString cwd;
		if( !condor_getcwd(cwd) ) {
			cwd = "<null>";
		}
        debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value());

		char *temp = my_username();
		debug_printf( DEBUG_DEBUG_1, "Current user is %s\n",
					   temp ? temp : "<null>" );
		if( temp ) {
			free( temp );
		}
    }

		//
		// Figure out the rescue DAG to run, if any (this is with "new-
		// style" rescue DAGs).
		//
	int rescueDagNum = 0;
	MyString rescueDagMsg;

	if ( dagman.doRescueFrom != 0 ) {
		rescueDagNum = dagman.doRescueFrom;
		rescueDagMsg.formatstr( "Rescue DAG number %d specified", rescueDagNum );
		RenameRescueDagsAfter( dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum );

	} else if ( dagman.autoRescue ) {
		rescueDagNum = FindLastRescueDagNum(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum );
		rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum );
	}

		//
		// Fill in values in the deep submit options that we haven't
		// already set.
		//
	dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError;
	dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir;
	dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue;
	dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom;
	dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch;
	dagman._submitDagDeepOpts.recurse = false;

    //
    // Create the DAG
    //

	// Note: a bunch of the parameters we pass here duplicate things
	// in submitDagOpts, but I'm keeping them separate so we don't have to
	// bother to construct a new SubmitDagOtions object for splices.
	// wenger 2010-03-25
    dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs,
						  dagman.maxPreScripts, dagman.maxPostScripts,
						  dagman.allowLogError, dagman.useDagDir,
						  dagman.maxIdle, dagman.retrySubmitFirst,
						  dagman.retryNodeFirst, dagman.condorRmExe,
						  dagman.storkRmExe, &dagman.DAGManJobId,
						  dagman.prohibitMultiJobs, dagman.submitDepthFirst,
						  dagman._defaultNodeLog,
						  dagman._generateSubdagSubmits,
						  &dagman._submitDagDeepOpts,
						  false ); /* toplevel dag! */

    if( dagman.dag == NULL ) {
        EXCEPT( "ERROR: out of memory!\n");
    }

	dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit );
	dagman.dag->SetAllowEvents( dagman.allow_events );
	dagman.dag->SetConfigFile( dagman._dagmanConfigFile );
	dagman.dag->SetMaxJobHolds( dagman._maxJobHolds );
	dagman.dag->SetPostRun(dagman._runPost);
	if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line
		dagman.dag->SetDefaultPriority(dagman._submitDagDeepOpts.priority);
	} else if( dagman._defaultPriority != 0 ) { // From config file
		dagman.dag->SetDefaultPriority(dagman._defaultPriority);
		dagman._submitDagDeepOpts.priority = dagman._defaultPriority;
	}

    //
    // Parse the input files.  The parse() routine
    // takes care of adding jobs and dependencies to the DagMan
    //
	dagman.mungeNodeNames = (dagman.dagFiles.number() > 1);
	parseSetDoNameMunge( dagman.mungeNodeNames );
   	debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n", 
		dagman.dagFiles.number() );
	dagman.dagFiles.rewind();
	char *dagFile;

	// Here we make a copy of the dagFiles for iteration purposes. Deep
	// inside the parser, the dagman.dagFiles string list gets copied,
	// which would otherwise disrupt iteration over the original list.
	StringList sl( dagman.dagFiles );
	sl.rewind();
	while ( (dagFile = sl.next()) != NULL ) {
    	debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile );

    	if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							false, true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			MSC_SUPPRESS_WARNING_FIXME(6031) // return value of unlink ignored.
			unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagFile );
    	}
	}
	if( dagman.dag->GetDefaultPriority() != 0 ) {
		dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag
	}
	dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId );
	if ( rescueDagNum > 0 ) {
			// Get our Pegasus sequence numbers set correctly.
		dagman.dag->GetJobstateLog().InitializeRescue();
	}

	// lift the final set of splices into the main dag.
	dagman.dag->LiftSplices(SELF);

		//
		// Actually parse the "new-new" style (partial DAG info only)
		// rescue DAG here.  Note: this *must* be done after splices
		// are lifted!
		//
	if ( rescueDagNum > 0 ) {
		dagman.rescueFileToRun = RescueDagName(
					dagman.primaryDagFile.Value(),
					dagman.multiDags, rescueDagNum );
		debug_printf ( DEBUG_QUIET, "%s; running %s in combination with "
					"normal DAG file%s\n", rescueDagMsg.Value(),
					dagman.rescueFileToRun.Value(),
					dagman.multiDags ? "s" : "");
		debug_printf ( DEBUG_QUIET,
					"~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n");

		debug_printf ( DEBUG_QUIET, "USING RESCUE DAG %s\n",
					dagman.rescueFileToRun.Value() );

			// Turn off node name munging for the rescue DAG, because
			// it will already have munged node names.
		parseSetDoNameMunge( false );

    	if( !parse( dagman.dag, dagman.rescueFileToRun.Value(),
					dagman.useDagDir ) ) {
			if ( dagman.dumpRescueDag ) {
					// Dump the rescue DAG so we can see what we got
					// in the failed parse attempt.
    			debug_printf( DEBUG_QUIET, "Dumping rescue DAG "
							"because of -DumpRescue flag\n" );
				dagman.dag->Rescue( dagman.primaryDagFile.Value(),
							dagman.multiDags, dagman.maxRescueDagNum,
							true, false );
			}
			
			dagman.dag->RemoveRunningJobs(dagman, true);
			MSC_SUPPRESS_WARNING_FIXME(6031) // return value of unlink ignored.
			unlink( lockFileName );
			dagman.CleanUp();
			
				// Note: debug_error calls DC_Exit().
        	debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n",
					 	dagman.rescueFileToRun.Value() );
    	}
	}

	dagman.dag->CheckThrottleCats();

	// fix up any use of $(JOB) in the vars values for any node
	dagman.dag->ResolveVarsInterpolations();

/*	debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/
/*	dagman.dag->PrintJobList();*/

#ifndef NOT_DETECT_CYCLE
	if( dagman.startup_cycle_detect && dagman.dag->isCycle() )
	{
		// Note: maybe we should run the final node here, if there is one.
		// wenger 2011-12-19.
		debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, please check input\n");
	}
#endif
    debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n",
				  dagman.dag->NumNodes( true ) );

	MyString firstLocation;
	if ( dagman.dag->GetReject( firstLocation ) ) {
    	debug_printf( DEBUG_QUIET, "Exiting because of REJECT "
					"specification in %s.  This most likely means "
					"that the DAG file was produced with the -DumpRescue "
					"flag when parsing the original DAG failed.\n",
					firstLocation.Value() );
		DC_Exit( EXIT_ERROR );
		return;
	}

	dagman.dag->DumpDotFile();

	if ( dagman.dumpRescueDag ) {
    	debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting "
					"because of -DumpRescue flag\n" );
		dagman.dag->Rescue( dagman.primaryDagFile.Value(),
					dagman.multiDags, dagman.maxRescueDagNum, false,
					false, false );
		ExitSuccess();
		return;
	}

    //------------------------------------------------------------------------
    // Bootstrap and Recovery
    //
    // If the lock file exists, this indicates a premature termination
    // of a previous run of DAGMan. If the condor log is also present,
    // we run in recovery mode; if the DAG log is not present, we do
    // not run in recovery mode.
  
    {
      bool recovery = access(lockFileName,  F_OK) == 0;
      
        if (recovery) {
            debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n",
                           lockFileName);
			if (dagman.abortDuplicates) {
				if (util_check_lock_file(lockFileName) == 1) {
        			debug_printf( DEBUG_QUIET, "Aborting because it "
							"looks like another instance of DAGMan is "
							"currently running on this DAG; if that is "
							"not the case, delete the lock file (%s) "
							"and re-submit the DAG.\n", lockFileName );
					dagman.dag->GetJobstateLog().
								WriteDagmanFinished( EXIT_RESTART );
    				dagman.CleanUp();
					DC_Exit( EXIT_ERROR );
					// We should never get to here!
				}
			}
        }

			//
			// If this DAGMan continues, it should overwrite the lock
			// file if it exists.
			//
		util_create_lock_file(lockFileName, dagman.abortDuplicates);

        debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n");
        if( !dagman.dag->Bootstrap( recovery ) ) {
            dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 );
            debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n");
        }
    }

    debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" );
    daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval, 
				condor_event_timer, "condor_event_timer" );

	dagman.dag->SetPendingNodeReportInterval(
				dagman.pendingReportInterval );
}
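The version gate above boils down to a small predicate. A minimal standalone sketch, assuming only CondorVersionInfo and CondorVersion() from the HTCondor version library; the 7.1.2 floor below is illustrative, not the real MIN_SUBMIT_FILE_VERSION:

#include "condor_version.h"

	// Returns true if a .condor.sub file created by csdVersion may be
	// run by this binary: identical versions always pass; otherwise the
	// submit file must be at least as new as the floor version.
static bool
submitFileCompatible( const char *csdVersion )
{
	CondorVersionInfo submitFileVersion( csdVersion );
	if ( !submitFileVersion.is_valid() ) {
		return false;
	}
	if ( submitFileVersion.compare_versions( CondorVersion() ) == 0 ) {
		return true;
	}
	return submitFileVersion.built_since_version( 7, 1, 2 );
}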
Exemplo n.º 14
0
void
BaseShadow::baseInit( ClassAd *job_ad, const char* schedd_addr, const char *xfer_queue_contact_info )
{
	int pending = FALSE;

	if( ! job_ad ) {
		EXCEPT("baseInit() called with NULL job_ad!");
	}
	jobAd = job_ad;

	if (sendUpdatesToSchedd && ! is_valid_sinful(schedd_addr)) {
		EXCEPT("schedd_addr not specified with valid address");
	}
	scheddAddr = sendUpdatesToSchedd ? strdup( schedd_addr ) : strdup("noschedd");

	m_xfer_queue_contact_info = xfer_queue_contact_info;

	if ( !jobAd->LookupString(ATTR_OWNER, owner)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_OWNER);
	}

	if( !jobAd->LookupInteger(ATTR_CLUSTER_ID, cluster)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_CLUSTER_ID);
	}

	if( !jobAd->LookupInteger(ATTR_PROC_ID, proc)) {
		EXCEPT("Job ad doesn't contain a %s attribute.", ATTR_PROC_ID);
	}


		// Grab the GlobalJobId if we've got it.
	if( ! jobAd->LookupString(ATTR_GLOBAL_JOB_ID, &gjid) ) {
		gjid = NULL;
	}

	// grab the NT domain if we've got it
	jobAd->LookupString(ATTR_NT_DOMAIN, domain);
	if ( !jobAd->LookupString(ATTR_JOB_IWD, iwd)) {
		EXCEPT("Job ad doesn't contain an %s attribute.", ATTR_JOB_IWD);
	}

	if( !jobAd->LookupFloat(ATTR_BYTES_SENT, prev_run_bytes_sent) ) {
		prev_run_bytes_sent = 0;
	}
	if( !jobAd->LookupFloat(ATTR_BYTES_RECVD, prev_run_bytes_recvd) ) {
		prev_run_bytes_recvd = 0;
	}

		// construct the core file name we'd get if we had one.
	MyString tmp_name = iwd;
	tmp_name += DIR_DELIM_CHAR;
	tmp_name += "core.";
	tmp_name += cluster;
	tmp_name += '.';
	tmp_name += proc;
	core_file_name = strdup( tmp_name.Value() );

        // put the shadow's sinful string into the jobAd.  Helpful for
        // the mpi shadow, at least...and a good idea in general.
	MyString tmp_addr = ATTR_MY_ADDRESS;
	tmp_addr += "=\"";
	tmp_addr += daemonCore->InfoCommandSinfulString();
	tmp_addr += '"';
    if ( !jobAd->Insert( tmp_addr.Value() )) {
        EXCEPT( "Failed to insert %s!", ATTR_MY_ADDRESS );
    }

	DebugId = display_dprintf_header;
	
	config();

		// Make sure we've got enough swap space to run
	checkSwap();

	// handle system calls with Owner's privilege
// XXX does this belong here?  We'll see...
	// Calling init_user_ids() while in user priv causes badness.
	// Make sure we're in another priv state.
	set_condor_priv();
	if ( !init_user_ids(owner.Value(), domain.Value())) {
		dprintf(D_ALWAYS, "init_user_ids() failed as user %s\n",owner.Value() );
		// uids.C will EXCEPT when we set_user_priv() now
		// so there's not much we can do at this point
		
#if ! defined(WIN32)
		if ( param_boolean( "SHADOW_RUN_UNKNOWN_USER_JOBS", false ) )
		{
			dprintf(D_ALWAYS, "trying init_user_ids() as user nobody\n" );
			
			owner="nobody";
			domain=NULL;
			if (!init_user_ids(owner.Value(), domain.Value()))
			{
				dprintf(D_ALWAYS, "init_user_ids() failed!\n");
			}
			else
			{
				jobAd->Assign( ATTR_JOB_RUNAS_OWNER, "FALSE" );
				m_RunAsNobody=true;
				dprintf(D_ALWAYS, "init_user_ids() now running as user nobody\n");
			}
		}
#endif

	}
	set_user_priv();
	daemonCore->Register_Priv_State( PRIV_USER );

	dumpClassad( "BaseShadow::baseInit()", this->jobAd, D_JOB );

		// initialize the UserPolicy object
	shadow_user_policy.init( jobAd, this );

		// setup an object to keep our job ad updated to the schedd's
		// permanent job queue.  this clears all the dirty bits on our
		// copy of the classad, so anything we touch after this will
		// be updated to the schedd when appropriate.

		// Unless we got a command line arg asking us not to
	if (sendUpdatesToSchedd) {
		// the usual case
		job_updater = new QmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	} else {
		job_updater = new NullQmgrJobUpdater( jobAd, scheddAddr, CondorVersion() );
	}

		// init user log; hold on failure
		// NOTE: job_updater must be initialized _before_ initUserLog(),
		// in order to handle the case of the job going on hold as a
		// result of failure in initUserLog().
	initUserLog();

		// change directory; hold on failure
	if ( cdToIwd() == -1 ) {
		EXCEPT("Could not cd to initial working directory");
	}

		// check to see if this invocation of the shadow is just to write
		// a terminate event and exit since this job had been recorded as
		// pending termination, but somehow the job didn't leave the queue
		// and the schedd is trying to restart it again..
	if( jobAd->LookupInteger(ATTR_TERMINATION_PENDING, pending)) {
		if (pending == TRUE) {
			// If the classad of this job "thinks" that this job should be
			// finished already, let's enact that belief.
			// This function does not return.
			this->terminateJob(US_TERMINATE_PENDING);
		}
	}

		// If we need to claim the startd before activating the claim
	int wantClaiming = 0;
	jobAd->LookupBool(ATTR_CLAIM_STARTD, wantClaiming);
	if (wantClaiming) {
		MyString startdSinful;
		MyString claimid;

			// Pull startd addr and claimid out of the jobad
		jobAd->LookupString(ATTR_STARTD_IP_ADDR, startdSinful);
		jobAd->LookupString(ATTR_CLAIM_ID, claimid);

		dprintf(D_ALWAYS, "%s is true, trying to claim startd %s\n", ATTR_CLAIM_STARTD, startdSinful.Value());

		classy_counted_ptr<DCStartd> startd = new DCStartd("description", NULL, startdSinful.Value(), claimid.Value());
	
		classy_counted_ptr<DCMsgCallback> cb = 
			new DCMsgCallback((DCMsgCallback::CppFunction)&BaseShadow::startdClaimedCB,
			this, jobAd);
																 
			// this can't fail, will always call the callback
		startd->asyncRequestOpportunisticClaim(jobAd, 
											   "description", 
											   daemonCore->InfoCommandSinfulString(), 
											   1200 /*alive interval*/, 
											   20 /* net timeout*/, 
											   100 /*total timeout*/, 
											   cb);
	}
}
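The hand-built "Attr=\"value\"" strings passed to Insert() above can be expressed with ClassAd::Assign(), which these examples use elsewhere; a minimal sketch, publishing the same ATTR_MY_ADDRESS attribute:

#include "condor_classad.h"
#include "condor_attributes.h"

	// Assign() quotes string values itself, so there is no need to
	// hand-escape the surrounding double quotes as Insert() requires.
static void
publishShadowAddress( ClassAd *ad, const char *sinful )
{
	if ( ad && sinful ) {
		ad->Assign( ATTR_MY_ADDRESS, sinful );
	}
}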
Exemplo n.º 15
0
//---------------------------------------------------------------------------
void writeSubmitFile(/* const */ SubmitDagDeepOptions &deepOpts,
			/* const */ SubmitDagShallowOptions &shallowOpts)
{
	FILE *pSubFile = safe_fopen_wrapper_follow(shallowOpts.strSubFile.Value(), "w");
	if (!pSubFile)
	{
		fprintf( stderr, "ERROR: unable to create submit file %s\n",
				 shallowOpts.strSubFile.Value() );
		exit( 1 );
	}

	const char *executable = NULL;
	MyString valgrindPath; // outside if so executable is valid!
	if ( shallowOpts.runValgrind ) {
		valgrindPath = which( valgrind_exe );
		if ( valgrindPath == "" ) {
			fprintf( stderr, "ERROR: can't find %s in PATH, aborting.\n",
				 		valgrind_exe );
			exit( 1 );
		} else {
			executable = valgrindPath.Value();
		}
	} else {
		executable = deepOpts.strDagmanPath.Value();
	}

    fprintf(pSubFile, "# Filename: %s\n", shallowOpts.strSubFile.Value());

    fprintf(pSubFile, "# Generated by condor_submit_dag ");
	shallowOpts.dagFiles.rewind();
	char *dagFile;
	while ( (dagFile = shallowOpts.dagFiles.next()) != NULL ) {
    	fprintf(pSubFile, "%s ", dagFile);
	}
    fprintf(pSubFile, "\n");

    fprintf(pSubFile, "universe\t= scheduler\n");
    fprintf(pSubFile, "executable\t= %s\n", executable);
	fprintf(pSubFile, "getenv\t\t= True\n");
	fprintf(pSubFile, "output\t\t= %s\n", shallowOpts.strLibOut.Value());
    fprintf(pSubFile, "error\t\t= %s\n", shallowOpts.strLibErr.Value());
    fprintf(pSubFile, "log\t\t= %s\n", shallowOpts.strSchedLog.Value());
#if !defined ( WIN32 )
    fprintf(pSubFile, "remove_kill_sig\t= SIGUSR1\n" );
#endif
    fprintf(pSubFile, "+%s\t= \"%s =?= $(cluster)\"\n",
				ATTR_OTHER_JOB_REMOVE_REQUIREMENTS, ATTR_DAGMAN_JOB_ID );

		// ensure DAGMan is automatically requeued by the schedd if it
		// exits abnormally or is killed (e.g., during a reboot)
	const char *defaultRemoveExpr = "( ExitSignal =?= 11 || "
				"(ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))";
	MyString removeExpr(defaultRemoveExpr);
	char *tmpRemoveExpr = param( "DAGMAN_ON_EXIT_REMOVE" );
	if ( tmpRemoveExpr ) {
		removeExpr = tmpRemoveExpr;
		free(tmpRemoveExpr);
	}
    fprintf(pSubFile, "# Note: default on_exit_remove expression:\n");
	fprintf(pSubFile, "# %s\n", defaultRemoveExpr);
	fprintf(pSubFile, "# attempts to ensure that DAGMan is automatically\n");
	fprintf(pSubFile, "# requeued by the schedd if it exits abnormally or\n");
    fprintf(pSubFile, "# is killed (e.g., during a reboot).\n");
    fprintf(pSubFile, "on_exit_remove\t= %s\n", removeExpr.Value() );

    fprintf(pSubFile, "copy_to_spool\t= %s\n", shallowOpts.copyToSpool ?
				"True" : "False" );

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// Be sure to change MIN_SUBMIT_FILE_VERSION in dagman_main.cpp
	// if the arguments passed to condor_dagman change in an
	// incompatible way!!
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	ArgList args;

	if ( shallowOpts.runValgrind ) {
		args.AppendArg("--tool=memcheck");
		args.AppendArg("--leak-check=yes");
		args.AppendArg("--show-reachable=yes");
		args.AppendArg(deepOpts.strDagmanPath.Value());
	}

		// -p 0 causes DAGMan to run w/o a command socket (see gittrac #4987).
	args.AppendArg("-p");
	args.AppendArg("0");
	args.AppendArg("-f");
	args.AppendArg("-l");
	args.AppendArg(".");
	if ( shallowOpts.iDebugLevel != DEBUG_UNSET ) {
		args.AppendArg("-Debug");
		args.AppendArg(shallowOpts.iDebugLevel);
	}
	args.AppendArg("-Lockfile");
	args.AppendArg(shallowOpts.strLockFile.Value());
	args.AppendArg("-AutoRescue");
	args.AppendArg(deepOpts.autoRescue);
	args.AppendArg("-DoRescueFrom");
	args.AppendArg(deepOpts.doRescueFrom);

	shallowOpts.dagFiles.rewind();
	while ( (dagFile = shallowOpts.dagFiles.next()) != NULL ) {
		args.AppendArg("-Dag");
		args.AppendArg(dagFile);
	}

    if(shallowOpts.iMaxIdle != 0) 
	{
		args.AppendArg("-MaxIdle");
		args.AppendArg(shallowOpts.iMaxIdle);
    }

    if(shallowOpts.iMaxJobs != 0) 
	{
		args.AppendArg("-MaxJobs");
		args.AppendArg(shallowOpts.iMaxJobs);
    }

    if(shallowOpts.iMaxPre != 0) 
	{
		args.AppendArg("-MaxPre");
		args.AppendArg(shallowOpts.iMaxPre);
    }

    if(shallowOpts.iMaxPost != 0) 
	{
		args.AppendArg("-MaxPost");
		args.AppendArg(shallowOpts.iMaxPost);
    }

	if(shallowOpts.bNoEventChecks)
	{
		// strArgs += " -NoEventChecks";
		printf( "Warning: -NoEventChecks is ignored; please use "
					"the DAGMAN_ALLOW_EVENTS config parameter instead\n");
	}

	if(!shallowOpts.bPostRun)
	{
		args.AppendArg("-DontAlwaysRunPost");
	}

	if(deepOpts.bAllowLogError)
	{
		args.AppendArg("-AllowLogError");
	}

	if(deepOpts.useDagDir)
	{
		args.AppendArg("-UseDagDir");
	}

	if(deepOpts.suppress_notification)
	{
		args.AppendArg("-Suppress_notification");
	}
	else
	{
		args.AppendArg("-Dont_Suppress_notification");
	}

	if ( shallowOpts.doRecovery ) {
		args.AppendArg( "-DoRecov" );
	}

	args.AppendArg("-CsdVersion");
	args.AppendArg(CondorVersion());

	if(deepOpts.allowVerMismatch) {
		args.AppendArg("-AllowVersionMismatch");
	}

	if(shallowOpts.dumpRescueDag) {
		args.AppendArg("-DumpRescue");
	}

	if(deepOpts.bVerbose) {
		args.AppendArg("-Verbose");
	}

	if(deepOpts.bForce) {
		args.AppendArg("-Force");
	}

	if(deepOpts.strNotification != "") {
		args.AppendArg("-Notification");
		args.AppendArg(deepOpts.strNotification);
	}

	if(deepOpts.strDagmanPath != "") {
		args.AppendArg("-Dagman");
		args.AppendArg(deepOpts.strDagmanPath);
	}

	if(deepOpts.strOutfileDir != "") {
		args.AppendArg("-Outfile_dir");
		args.AppendArg(deepOpts.strOutfileDir);
	}

	if(deepOpts.updateSubmit) {
		args.AppendArg("-Update_submit");
	}

	if(deepOpts.importEnv) {
		args.AppendArg("-Import_env");
	}

	if( deepOpts.priority != 0 ) {
		args.AppendArg("-Priority");
		args.AppendArg(deepOpts.priority);
	}

	MyString arg_str,args_error;
	if(!args.GetArgsStringV1WackedOrV2Quoted(&arg_str,&args_error)) {
		fprintf(stderr,"Failed to insert arguments: %s",args_error.Value());
		exit(1);
	}
    fprintf(pSubFile, "arguments\t= %s\n", arg_str.Value());

	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	// Be sure to change MIN_SUBMIT_FILE_VERSION in dagman_main.cpp
	// if the environment passed to condor_dagman changes in an
	// incompatible way!!
	//~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	EnvFilter env;
	if ( deepOpts.importEnv ) {
		env.Import( );
	}
	env.SetEnv("_CONDOR_DAGMAN_LOG", shallowOpts.strDebugLog.Value());
	env.SetEnv("_CONDOR_MAX_DAGMAN_LOG=0");
	if ( shallowOpts.strScheddDaemonAdFile != "" ) {
		env.SetEnv("_CONDOR_SCHEDD_DAEMON_AD_FILE",
				   shallowOpts.strScheddDaemonAdFile.Value());
	}
	if ( shallowOpts.strScheddAddressFile != "" ) {
		env.SetEnv("_CONDOR_SCHEDD_ADDRESS_FILE",
				   shallowOpts.strScheddAddressFile.Value());
	}
	if ( shallowOpts.strConfigFile != "" ) {
		if ( access( shallowOpts.strConfigFile.Value(), F_OK ) != 0 ) {
			fprintf( stderr, "ERROR: unable to read config file %s "
						"(error %d, %s)\n",
						shallowOpts.strConfigFile.Value(), errno, strerror(errno) );
			exit(1);
		}
		env.SetEnv("_CONDOR_DAGMAN_CONFIG_FILE", shallowOpts.strConfigFile.Value());
	}

	MyString env_str;
	MyString env_errors;
	if(!env.getDelimitedStringV1RawOrV2Quoted(&env_str,&env_errors)) {
		fprintf(stderr,"Failed to insert environment: %s",env_errors.Value());
		exit(1);
	}
    fprintf(pSubFile, "environment\t= %s\n",env_str.Value());

    if(deepOpts.strNotification != "") 
	{	
		fprintf(pSubFile, "notification\t= %s\n", deepOpts.strNotification.Value());
    }

		// Append user-specified stuff to submit file...
		// ...first, the insert file, if any...
	if (shallowOpts.appendFile != "") {
		FILE *aFile = safe_fopen_wrapper_follow(shallowOpts.appendFile.Value(), "r");
		if (!aFile)
		{
			fprintf( stderr, "ERROR: unable to read submit append file (%s)\n",
				 	shallowOpts.appendFile.Value() );
			exit( 1 );
		}

		char *line;
		int lineno = 0;
		while ((line = getline_trim(aFile, lineno)) != NULL) {
    		fprintf(pSubFile, "%s\n", line);
		}

		fclose(aFile);
	}

		// ...now things specified directly on the command line.
	shallowOpts.appendLines.rewind();
	char *command;
	while ((command = shallowOpts.appendLines.next()) != NULL) {
    	fprintf(pSubFile, "%s\n", command);
	}

    fprintf(pSubFile, "queue\n");

	fclose(pSubFile);
}
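For orientation, a run of writeSubmitFile() along these lines emits a scheduler-universe submit file shaped roughly like the one below; the file names, debug level, and version argument are illustrative, and the expanded attribute names assume the usual ATTR_ constant values:

# Filename: diamond.dag.condor.sub
# Generated by condor_submit_dag diamond.dag 
universe	= scheduler
executable	= /usr/bin/condor_dagman
getenv		= True
output		= diamond.dag.lib.out
error		= diamond.dag.lib.err
log		= diamond.dag.dagman.log
remove_kill_sig	= SIGUSR1
+OtherJobRemoveRequirements	= "DAGManJobId =?= $(cluster)"
on_exit_remove	= ( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))
copy_to_spool	= False
arguments	= "-p 0 -f -l . -Debug 3 -Lockfile diamond.dag.lock -AutoRescue 1 -DoRescueFrom 0 -Dag diamond.dag -CsdVersion ..."
environment	= _CONDOR_DAGMAN_LOG=diamond.dag.dagman.out;_CONDOR_MAX_DAGMAN_LOG=0
queue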
Exemplo n.º 16
0
void
printClassAd( void )
{
	printf( "%s = \"%s\"\n", ATTR_VERSION, CondorVersion() );
	printf( "%s = True\n", ATTR_IS_DAEMON_CORE );
	printf( "%s = True\n", ATTR_HAS_FILE_TRANSFER );
	printf( "%s = True\n", ATTR_HAS_PER_FILE_ENCRYPTION );
	printf( "%s = True\n", ATTR_HAS_RECONNECT );
	printf( "%s = True\n", ATTR_HAS_MPI );
	printf( "%s = True\n", ATTR_HAS_TDP );
	printf( "%s = True\n", ATTR_HAS_JOB_DEFERRAL );

		/*
		  Attributes describing what kinds of Job Info Communicators
		  this starter has.  This is mostly for COD, but someday might
		  be useful to other people, too.  There's no need to
		  advertise the fact we've got a JICShadow, since all starters
		  always have and will be able to communicate with a shadow...
		*/

	printf( "%s = True\n", ATTR_HAS_JIC_LOCAL_CONFIG );
	printf( "%s = True\n", ATTR_HAS_JIC_LOCAL_STDIN );

	ClassAd *ad = java_detect();
	if(ad) {
		int gotone=0;
		float mflops;
		char *str = 0;

		if(ad->LookupString(ATTR_JAVA_VENDOR,&str)) {
			printf("%s = \"%s\"\n",ATTR_JAVA_VENDOR,str);
			free(str);
			str = 0;
			gotone++;
		}
		if(ad->LookupString(ATTR_JAVA_VERSION,&str)) {
			printf("%s = \"%s\"\n",ATTR_JAVA_VERSION,str);
			free(str);
			str = 0;
			gotone++;
		}
		if(ad->LookupString("JavaSpecificationVersion",&str)) {
			printf("JavaSpecificationVersion = \"%s\"\n",str);
			free(str);
			str = 0;
			gotone++;
		}
		if(ad->LookupFloat(ATTR_JAVA_MFLOPS,mflops)) {
			printf("%s = %f\n", ATTR_JAVA_MFLOPS,mflops);
			gotone++;
		}
		if(gotone>0) printf( "%s = True\n",ATTR_HAS_JAVA);		
		delete ad;
	}

	// VM universe stuff
	if( VMProc::vm_univ_detect() ) {
		// This doesn't mean that the vm universe is really available.
		// It just means that the starter includes code for the vm
		// universe. Actual testing for the vm universe is done by the
		// vmuniverse manager in the startd, and ATTR_HAS_VM may be
		// overwritten there.
		printf( "%s = True\n",ATTR_HAS_VM);		
	}

	// Advertise which file transfer plugins are supported
	FileTransfer ft;
	CondorError e;
	ft.InitializePlugins(e);
	if (e.code()) {
		dprintf(D_ALWAYS, "WARNING: Initializing plugins returned: %s\n", e.getFullText().c_str());
	}

	MyString method_list = ft.GetSupportedMethods();
	if (!method_list.IsEmpty()) {
		printf("%s = \"%s\"\n", ATTR_HAS_FILE_TRANSFER_PLUGIN_METHODS, method_list.Value());
	}

#if defined(WIN32)
		// Advertise our ability to run jobs as the submitting user
	printf("%s = True\n", ATTR_HAS_WIN_RUN_AS_OWNER);
#endif
}
Exemplo n.º 17
0
bool 
DCSchedd::spoolJobFiles(int JobAdsArrayLen, ClassAd* JobAdsArray[], CondorError * errstack)
{
	int reply;
	int i;
	ReliSock rsock;
	bool use_new_command = true;

	if ( version() ) {
		CondorVersionInfo vi( version() );
		if ( vi.built_since_version(6,7,7) ) {
			use_new_command = true;
		} else {
			use_new_command = false;
		}
	}

		// // // // // // // //
		// On the wire protocol
		// // // // // // // //

	rsock.timeout(20);   // years of research... :)
	if( ! rsock.connect(_addr) ) {
		std::string errmsg;
		formatstr(errmsg, "Failed to connect to schedd (%s)", _addr);

		dprintf( D_ALWAYS, "DCSchedd::spoolJobFiles: %s\n", errmsg.c_str() );

		if( errstack ) {
			errstack->push(
				"DCSchedd::spoolJobFiles",CEDAR_ERR_CONNECT_FAILED,
				errmsg.c_str() );
		}
		return false;
	}
	if ( use_new_command ) {
		if( ! startCommand(SPOOL_JOB_FILES_WITH_PERMS, (Sock*)&rsock, 0,
						   errstack) ) {

			dprintf( D_ALWAYS, "DCSchedd::spoolJobFiles: "
				"Failed to send command (SPOOL_JOB_FILES_WITH_PERMS) "
				"to the schedd (%s)\n", _addr );

			return false;
		}
	} else {
		if( ! startCommand(SPOOL_JOB_FILES, (Sock*)&rsock, 0, errstack) ) {
			dprintf( D_ALWAYS, "DCSchedd::spoolJobFiles: "
					 "Failed to send command (SPOOL_JOB_FILES) "
					 "to the schedd (%s)\n", _addr );

			return false;
		}
	}

		// First, if we're not already authenticated, force that now. 
	if (!forceAuthentication( &rsock, errstack )) {
		dprintf( D_ALWAYS, "DCSchedd: authentication failure: %s\n",
				 errstack ? errstack->getFullText().c_str() : "" );
		return false;
	}

	rsock.encode();

		// Send our version if using the new command
	if ( use_new_command ) {
			// Need to use a named variable, else the wrong version of	
			// code() is called.
		char *my_version = strdup( CondorVersion() );
		if ( !rsock.code(my_version) ) {
			dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: "
					"Can't send version string to the schedd\n");
			free( my_version );
			return false;
		}
		free( my_version );
	}

		// Send the number of jobs
	if ( !rsock.code(JobAdsArrayLen) ) {
		dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: "
				"Can't send JobAdsArrayLen to the schedd\n");
		return false;
	}

	if( !rsock.end_of_message() ) {
		std::string errmsg;
		formatstr(errmsg,
				"Can't send initial message (version + count) to schedd (%s)",
				_addr);

		dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: %s\n", errmsg.c_str());

		if( errstack ) {
			errstack->push(
				"DCSchedd::spoolJobFiles",
				CEDAR_ERR_EOM_FAILED,
				errmsg.c_str());
		}
		return false;
	}

		// Now, put the job ids onto the wire
	PROC_ID jobid;
	for (i=0; i<JobAdsArrayLen; i++) {
		if (!JobAdsArray[i]->LookupInteger(ATTR_CLUSTER_ID,jobid.cluster)) {
			dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: "
					"Job ad %d did not have a cluster id\n",i);
			return false;
		}
		if (!JobAdsArray[i]->LookupInteger(ATTR_PROC_ID,jobid.proc)) {
			dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: "
					"Job ad %d did not have a proc id\n",i);
			return false;
		}
		rsock.code(jobid);
	}

	if( !rsock.end_of_message() ) {
		std::string errmsg;
		formatstr(errmsg, "Failed while sending job ids to schedd (%s)", _addr);

		dprintf(D_ALWAYS,"DCSchedd:spoolJobFiles: %s\n", errmsg.c_str());

		if( errstack ) {
			errstack->push(
				"DCSchedd::spoolJobFiles",
				CEDAR_ERR_EOM_FAILED,
				errmsg.c_str());
		}
		return false;
	}

		// Now send all the files via the file transfer object
	for (i=0; i<JobAdsArrayLen; i++) {
		FileTransfer ftrans;
		if ( !ftrans.SimpleInit(JobAdsArray[i], false, false, &rsock, PRIV_UNKNOWN, false, true) ) {
			if( errstack ) {
				int cluster = -1, proc = -1;
				if(JobAdsArray[i]) {
					JobAdsArray[i]->LookupInteger(ATTR_CLUSTER_ID,cluster);
					JobAdsArray[i]->LookupInteger(ATTR_PROC_ID,proc);
				}
				errstack->pushf(
					"DCSchedd::spoolJobFiles",
					FILETRANSFER_INIT_FAILED,
					"File transfer initialization failed for target job %d.%d",
					cluster, proc );
			}
			return false;
		}
		if ( use_new_command ) {
			ftrans.setPeerVersion( version() );
		}
		if ( !ftrans.UploadFiles(true,false) ) {
			if( errstack ) {
				FileTransfer::FileTransferInfo ft_info = ftrans.GetInfo();

				int cluster = -1, proc = -1;
				if(JobAdsArray[i]) {
					JobAdsArray[i]->LookupInteger(ATTR_CLUSTER_ID,cluster);
					JobAdsArray[i]->LookupInteger(ATTR_PROC_ID,proc);
				}
				errstack->pushf(
					"DCSchedd::spoolJobFiles",
					FILETRANSFER_UPLOAD_FAILED,
					"File transfer failed for target job %d.%d: %s",
					cluster, proc, ft_info.error_desc.Value() );
			}
			return false;
		}
	}	
		
		
	rsock.end_of_message();

	rsock.decode();

	reply = 0;
	rsock.code(reply);
	rsock.end_of_message();

	return ( reply == 1 );
}
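A minimal caller sketch for the spooling protocol above, assuming the daemon-client headers; the schedd address is hypothetical and error handling is abbreviated:

#include "condor_common.h"
#include "dc_schedd.h"

	// Spool the input sandbox of one previously-submitted job.
static bool
spoolOneJob( ClassAd *jobAd )
{
	DCSchedd schedd( "<192.168.0.10:9618>" );	// hypothetical sinful string
	ClassAd *ads[1] = { jobAd };
	CondorError errstack;
	if ( !schedd.spoolJobFiles( 1, ads, &errstack ) ) {
		dprintf( D_ALWAYS, "spooling failed: %s\n",
				 errstack.getFullText().c_str() );
		return false;
	}
	return true;
}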
Exemplo n.º 18
0
// using jobids structure so the schedd doesn't have to iterate over all the
// job ads.
bool 
DCSchedd::requestSandboxLocation(int direction, 
	int JobAdsArrayLen, ClassAd* JobAdsArray[], int protocol, 
	ClassAd *respad, CondorError * errstack)
{
	StringList sl;
	ClassAd reqad;
	std::string str;
	int i;
	int cluster, proc;
	char *tmp = NULL;

	////////////////////////////////////////////////////////////////////////
	// This request knows exactly which jobs it wants to talk about, so
	// just compact them into the classad to send to the schedd.
	////////////////////////////////////////////////////////////////////////

	reqad.Assign(ATTR_TREQ_DIRECTION, direction);
	reqad.Assign(ATTR_TREQ_PEER_VERSION, CondorVersion());
	reqad.Assign(ATTR_TREQ_HAS_CONSTRAINT, false);

	for (i = 0; i < JobAdsArrayLen; i++) {
		if (!JobAdsArray[i]->LookupInteger(ATTR_CLUSTER_ID,cluster)) {
			dprintf(D_ALWAYS,"DCSchedd:requestSandboxLocation: "
					"Job ad %d did not have a cluster id\n",i);
			return false;
		}
		if (!JobAdsArray[i]->LookupInteger(ATTR_PROC_ID,proc)) {
			dprintf(D_ALWAYS,"DCSchedd:requestSandboxLocation(): "
					"Job ad %d did not have a proc id\n",i);
			return false;
		}
		
		formatstr(str, "%d.%d", cluster, proc);

		// make something like: 1.0, 1.1, 1.2, ....
		sl.append(str.c_str());
	}

	tmp = sl.print_to_string();

	reqad.Assign(ATTR_TREQ_JOBID_LIST, tmp);

	free(tmp);
	tmp = NULL;

	// XXX This should be a realized function to convert between the
	// protocol enum and a string description. That way both functions can
	// use it and it won't seem so bad.
	switch(protocol) {
		case FTP_CFTP:
			reqad.Assign(ATTR_TREQ_FTP, FTP_CFTP);
			break;
		default:
			dprintf(D_ALWAYS, "DCSchedd::requestSandboxLocation(): "
				"Can't make a request for a sandbox with an unknown file "
				"transfer protocol!");
			return false;
			break;
	}

	// now connect to the schedd and get the response.
	return requestSandboxLocation(&reqad, respad, errstack);
}
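The ATTR_TREQ_JOBID_LIST value built above is just the StringList rendering of the "cluster.proc" entries; a minimal sketch with illustrative job ids:

#include "string_list.h"

	// Render a jobid list the way requestSandboxLocation() does;
	// the caller must free() the returned string.
static char *
makeJobidList( void )
{
	StringList sl;
	sl.append( "12.0" );
	sl.append( "12.1" );
	return sl.print_to_string();	// e.g. "12.0,12.1" with the default delimiter
}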
Exemplo n.º 19
0
/* Utility function to create a generic job ad. The caller can then fill
 * in the relevant details.
 */
ClassAd *CreateJobAd( const char *owner, int universe, const char *cmd )
{
	ClassAd *job_ad = new ClassAd();

	SetMyTypeName(*job_ad, JOB_ADTYPE);
	SetTargetTypeName(*job_ad, STARTD_ADTYPE);

	if ( owner ) {
		job_ad->Assign( ATTR_OWNER, owner );
	} else {
		job_ad->AssignExpr( ATTR_OWNER, "Undefined" );
	}
	job_ad->Assign( ATTR_JOB_UNIVERSE, universe );
	job_ad->Assign( ATTR_JOB_CMD, cmd );

	job_ad->Assign( ATTR_Q_DATE, (int)time(NULL) );
	job_ad->Assign( ATTR_COMPLETION_DATE, 0 );

	job_ad->Assign( ATTR_JOB_REMOTE_WALL_CLOCK, 0.0 );
	job_ad->Assign( ATTR_JOB_LOCAL_USER_CPU, 0.0 );
	job_ad->Assign( ATTR_JOB_LOCAL_SYS_CPU, 0.0 );
	job_ad->Assign( ATTR_JOB_REMOTE_USER_CPU, 0.0 );
	job_ad->Assign( ATTR_JOB_REMOTE_SYS_CPU, 0.0 );

		// This is a magic cookie, see how condor_submit sets it
	job_ad->Assign( ATTR_CORE_SIZE, -1 );

		// Are these ones really necessary?
	job_ad->Assign( ATTR_JOB_EXIT_STATUS, 0 );
	job_ad->Assign( ATTR_ON_EXIT_BY_SIGNAL, false );

	job_ad->Assign( ATTR_NUM_CKPTS, 0 );
	job_ad->Assign( ATTR_NUM_JOB_STARTS, 0 );
	job_ad->Assign( ATTR_NUM_RESTARTS, 0 );
	job_ad->Assign( ATTR_NUM_SYSTEM_HOLDS, 0 );
	job_ad->Assign( ATTR_JOB_COMMITTED_TIME, 0 );
	job_ad->Assign( ATTR_CUMULATIVE_SLOT_TIME, 0 );
	job_ad->Assign( ATTR_COMMITTED_SLOT_TIME, 0 );
	job_ad->Assign( ATTR_TOTAL_SUSPENSIONS, 0 );
	job_ad->Assign( ATTR_LAST_SUSPENSION_TIME, 0 );
	job_ad->Assign( ATTR_CUMULATIVE_SUSPENSION_TIME, 0 );
	job_ad->Assign( ATTR_COMMITTED_SUSPENSION_TIME, 0 );

	job_ad->Assign( ATTR_JOB_ROOT_DIR, "/" );

	job_ad->Assign( ATTR_MIN_HOSTS, 1 );
	job_ad->Assign( ATTR_MAX_HOSTS, 1 );
	job_ad->Assign( ATTR_CURRENT_HOSTS, 0 );

	job_ad->Assign( ATTR_WANT_REMOTE_SYSCALLS, false );
	job_ad->Assign( ATTR_WANT_CHECKPOINT, false );
	job_ad->Assign( ATTR_WANT_REMOTE_IO, true );

	job_ad->Assign( ATTR_JOB_STATUS, IDLE );
	job_ad->Assign( ATTR_ENTERED_CURRENT_STATUS, (int)time(NULL) );

	job_ad->Assign( ATTR_JOB_PRIO, 0 );
	job_ad->Assign( ATTR_NICE_USER, false );

	job_ad->Assign( ATTR_JOB_NOTIFICATION, NOTIFY_NEVER );

	

	job_ad->Assign( ATTR_IMAGE_SIZE, 100 );

	job_ad->Assign( ATTR_JOB_IWD, "/tmp" );
	job_ad->Assign( ATTR_JOB_INPUT, NULL_FILE );
	job_ad->Assign( ATTR_JOB_OUTPUT, NULL_FILE );
	job_ad->Assign( ATTR_JOB_ERROR, NULL_FILE );

		// Not sure what to do with these. If stdin/out/err is unset in the
		// submit file, condor_submit sets In/Out/Err to the NULL_FILE and
		// these transfer attributes to false. Otherwise, it leaves the
		// transfer attributes unset (which is treated as true). If we
		// explicitly set these to false here, our caller needs to reset
		// them to true if it changes In/Out/Err and wants the default
		// behavior of transferring them. This will probably be a common
		// oversight. Leaving them unset should be safe if our caller doesn't
		// change In/Out/Err.
	//job_ad->Assign( ATTR_TRANSFER_INPUT, false );
	//job_ad->Assign( ATTR_TRANSFER_OUTPUT, false );
	//job_ad->Assign( ATTR_TRANSFER_ERROR, false );
	//job_ad->Assign( ATTR_TRANSFER_EXECUTABLE, false );

	job_ad->Assign( ATTR_BUFFER_SIZE, 512*1024 );
	job_ad->Assign( ATTR_BUFFER_BLOCK_SIZE, 32*1024 );

	job_ad->Assign( ATTR_SHOULD_TRANSFER_FILES,
					getShouldTransferFilesString( STF_YES ) );
	job_ad->Assign( ATTR_WHEN_TO_TRANSFER_OUTPUT,
					getFileTransferOutputString( FTO_ON_EXIT ) );

	job_ad->Assign( ATTR_REQUIREMENTS, true );

	job_ad->Assign( ATTR_PERIODIC_HOLD_CHECK, false );
	job_ad->Assign( ATTR_PERIODIC_REMOVE_CHECK, false );
	job_ad->Assign( ATTR_PERIODIC_RELEASE_CHECK, false );

	job_ad->Assign( ATTR_ON_EXIT_HOLD_CHECK, false );
	job_ad->Assign( ATTR_ON_EXIT_REMOVE_CHECK, true );

	job_ad->Assign( ATTR_JOB_ARGUMENTS1, "" );

	job_ad->Assign( ATTR_JOB_LEAVE_IN_QUEUE, false );

	job_ad->AssignExpr( ATTR_REQUEST_MEMORY, "ifthenelse(MemoryUsage isnt undefined,MemoryUsage,( ImageSize + 1023 ) / 1024)" );
	job_ad->AssignExpr( ATTR_REQUEST_DISK, "DiskUsage" );
	job_ad->Assign( ATTR_DISK_USAGE, 1 );
	job_ad->Assign( ATTR_REQUEST_CPUS, 1 );

	// Without these, the starter will not automatically remap the stdout/err (look at sha 422735d9)
	// and possibly won't put them in the correct directory.
	job_ad->Assign( ATTR_STREAM_OUTPUT, false );
	job_ad->Assign( ATTR_STREAM_ERROR, false );

	job_ad->Assign( ATTR_VERSION, CondorVersion() );
	job_ad->Assign( ATTR_PLATFORM, CondorPlatform() );

		// Note: ATTR_Q_DATE was already set above; this later
		// assignment is the one that sticks.
	job_ad->Assign( ATTR_Q_DATE, time(NULL) );

	return job_ad;
}
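A minimal usage sketch for CreateJobAd(); the owner, universe, and command are illustrative, and a caller overrides whichever defaults it cares about:

#include "condor_attributes.h"
#include "condor_universe.h"

static ClassAd *
makeSleepJobAd( void )
{
	ClassAd *ad = CreateJobAd( "nobody", CONDOR_UNIVERSE_VANILLA,
				"/bin/sleep" );
	ad->Assign( ATTR_JOB_ARGUMENTS1, "600" );
	ad->Assign( ATTR_JOB_IWD, "/var/tmp" );		// override the /tmp default
	return ad;
}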
Exemplo n.º 20
0
//---------------------------------------------------------------------------
void
parseCommandLine(SubmitDagDeepOptions &deepOpts,
			SubmitDagShallowOptions &shallowOpts, int argc,
			const char * const argv[])
{
	for (int iArg = 1; iArg < argc; iArg++)
	{
		MyString strArg = argv[iArg];

		if (strArg[0] != '-')
		{
				// We assume an argument without a leading hyphen is
				// a DAG file name.
			shallowOpts.dagFiles.append(strArg.Value());
			if ( shallowOpts.primaryDagFile == "" ) {
				shallowOpts.primaryDagFile = strArg;
			}
		}
		else if (shallowOpts.primaryDagFile != "")
		{
				// Disallow hyphen args after DAG file name(s).
			printf("ERROR: no arguments allowed after DAG file name(s)\n");
			printUsage();
		}
		else
		{
			strArg.lower_case();

			// Note: in checking the argument names here, we only check for
			// as much of the full name as we need to unambiguously define
			// the argument.
			if (strArg.find("-no_s") != -1) // -no_submit
			{
				shallowOpts.bSubmit = false;
			}
			else if (strArg.find("-vers") != -1) // -version
			{
				printf( "%s\n%s\n", CondorVersion(), CondorPlatform() );
				exit( 0 );
			}
			else if (strArg.find("-help") != -1 || strArg.find("-h") != -1) // -help
			{
				printUsage(0);
			}
				// submit and stick to a specific schedd
			else if (strArg.find("-schedd-daemon-ad-file") != -1)
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-schedd-daemon-ad-file argument needs a value\n");
					printUsage();
				}
				shallowOpts.strScheddDaemonAdFile = argv[++iArg];
			}
				// submit and stick to a specific schedd
			else if (strArg.find("-schedd-address-file") != -1)
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-schedd-address-file argument needs a value\n");
					printUsage();
				}
				shallowOpts.strScheddAddressFile = argv[++iArg];
			}
			else if (strArg.find("-f") != -1) // -force
			{
				deepOpts.bForce = true;
			}
			else if (strArg.find("-not") != -1) // -notification
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-notification argument needs a value\n");
					printUsage();
				}
				deepOpts.strNotification = argv[++iArg];
			}
			else if (strArg.find("-r") != -1) // submit to remote schedd
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-r argument needs a value\n");
					printUsage();
				}
				shallowOpts.strRemoteSchedd = argv[++iArg];
			}
			else if (strArg.find("-dagman") != -1)
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-dagman argument needs a value\n");
					printUsage();
				}
				deepOpts.strDagmanPath = argv[++iArg];
			}
			else if (strArg.find("-de") != -1) // -debug
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-debug argument needs a value\n");
					printUsage();
				}
				shallowOpts.iDebugLevel = atoi(argv[++iArg]);
			}
			else if (strArg.find("-noev") != -1) // -noeventchecks
			{
				shallowOpts.bNoEventChecks = true;
			}
			else if (strArg.find("-allowlog") != -1) // -allowlogerror
			{
				deepOpts.bAllowLogError = true;
			}
			else if (strArg.find("-use") != -1) // -usedagdir
			{
				deepOpts.useDagDir = true;
			}
			else if (strArg.find("-out") != -1) // -outfile_dir
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-outfile_dir argument needs a value\n");
					printUsage();
				}
				deepOpts.strOutfileDir = argv[++iArg];
			}
			else if (strArg.find("-con") != -1) // -config
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-config argument needs a value\n");
					printUsage();
				}
				shallowOpts.strConfigFile = argv[++iArg];
					// Internally we deal with all configuration file paths
					// as full paths, to make it easier to determine whether
					// several paths point to the same file.
				MyString	errMsg;
				if (!MakePathAbsolute(shallowOpts.strConfigFile, errMsg)) {
					fprintf( stderr, "%s\n", errMsg.Value() );
   					exit( 1 );
				}
			}
			else if (strArg.find("-app") != -1) // -append
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-append argument needs a value\n");
					printUsage();
				}
				shallowOpts.appendLines.append(argv[++iArg]);
			}
			else if (strArg.find("-insert") != -1) // -insert_sub_file
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-insert_sub_file argument needs a value\n");
					printUsage();
				}
				++iArg;
				if (shallowOpts.appendFile != "") {
					printf("Note: -insert_sub_file value (%s) overriding "
								"DAGMAN_INSERT_SUB_FILE setting (%s)\n",
								argv[iArg], shallowOpts.appendFile.Value());
				}
				shallowOpts.appendFile = argv[iArg];
			}
			else if (strArg.find("-autor") != -1) // -autorescue
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-autorescue argument needs a value\n");
					printUsage();
				}
				deepOpts.autoRescue = (atoi(argv[++iArg]) != 0);
			}
			else if (strArg.find("-dores") != -1) // -dorescuefrom
			{
				if (iArg + 1 >= argc) {
					fprintf(stderr, "-dorescuefrom argument needs a value\n");
					printUsage();
				}
				deepOpts.doRescueFrom = atoi(argv[++iArg]);
			}
			else if (strArg.find("-allowver") != -1) // -AllowVersionMismatch
			{
				deepOpts.allowVerMismatch = true;
			}
			else if (strArg.find("-no_rec") != -1) // -no_recurse
			{
				deepOpts.recurse = false;
			}
			else if (strArg.find("-do_rec") != -1) // -do_recurse
			{
				deepOpts.recurse = true;
			}
			else if (strArg.find("-updat") != -1) // -update_submit
			{
				deepOpts.updateSubmit = true;
			}
			else if (strArg.find("-import_env") != -1) // -import_env
			{
				deepOpts.importEnv = true;
			}			     
			else if (strArg.find("-dumpr") != -1) // -DumpRescue
			{
				shallowOpts.dumpRescueDag = true;
			}
			else if (strArg.find("-valgrind") != -1) // -valgrind
			{
				shallowOpts.runValgrind = true;
			}
				// This must come last, so we can have other arguments
				// that start with -v.
			else if ( (strArg.find("-v") != -1) ) // -verbose
			{
				deepOpts.bVerbose = true;
			}
			else if ( (strArg.find("-dontalwaysrun") != -1) ) // DontAlwaysRunPost
			{
				shallowOpts.bPostRun = false;
			}
			else if ( (strArg.find("-dont_use_default_node_log") != -1) )
			{
				fprintf( stderr, "Error: -dont_use_default_node_log is no longer allowed\n" );
				printUsage();
			}
			else if ( (strArg.find("-suppress_notification") != -1) )
			{
				deepOpts.suppress_notification = true;
			}
			else if ( (strArg.find("-dont_suppress_notification") != -1) )
			{
				deepOpts.suppress_notification = false;
			}
			else if( (strArg.find("-prio") != -1) ) // -priority
			{
				if(iArg + 1 >= argc) {
					fprintf(stderr, "-priority argument needs a value\n");
					printUsage();
				}
				deepOpts.priority = atoi(argv[++iArg]);
			}
			else if ( (strArg.find("-dorecov") != -1) )
			{
				shallowOpts.doRecovery = true;
			}
			else if ( parsePreservedArgs( strArg, iArg, argc, argv,
						shallowOpts) )
			{
				// No-op here
			}
			else
			{
				fprintf( stderr, "ERROR: unknown option %s\n", strArg.Value() );
				printUsage();
			}
		}
	}

	if (shallowOpts.primaryDagFile == "")
	{
		fprintf( stderr, "ERROR: no dag file specified; aborting.\n" );
		printUsage();
	}

	if (deepOpts.doRescueFrom < 0)
	{
		fprintf( stderr, "-dorescuefrom value must be non-negative; aborting.\n");
		printUsage();
	}
}
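Because each argument is lowercased and then matched by substring, any unambiguous prefix selects an option; a minimal sketch of the matching step used above:

#include "MyString.h"

	// True for "-allowver", "-AllowVerMis", etc. -- all treated the
	// same as the full -AllowVersionMismatch flag.
static bool
matchesAllowVerMismatch( const char *arg )
{
	MyString strArg = arg;
	strArg.lower_case();
	return strArg.find( "-allowver" ) != -1;
}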
Exemplo n.º 21
0
int
stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		// CREATE_CONDOR_SECURITY_SESSION contains sensitive data that
		// normally shouldn't be written to a publicly-readable log.
		// We should conceal it unless GAHP_DEBUG_HIDE_SENSITIVE_DATA
		// says not to.
		if ( param_boolean( "GAHP_DEBUG_HIDE_SENSITIVE_DATA", true ) &&
			 strncmp( command, GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					  strlen( GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION ) ) == 0 ) {
			dprintf( D_ALWAYS, "got stdin: %s XXXXXXXX\n",
					 GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION );
		} else {
			dprintf (D_ALWAYS, "got stdin: %s\n", command);
		}

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {
					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
				gahp_output_return_success();
				DC_Exit(0);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
					// Note: unreachable -- QUIT is already handled
					// above, where it calls DC_Exit(0).
				gahp_output_return_success();
				return 0; // exit
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					GAHP_COMMAND_CONDOR_VERSION,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
				gahp_output_return (commands, 12);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION) == 0) {
				ClaimIdParser claimid( args.argv[1] );
				if ( !daemonCore->getSecMan()->CreateNonNegotiatedSecuritySession(
										DAEMON,
										claimid.secSessionId(),
										claimid.secSessionKey(),
										claimid.secSessionInfo(),
										CONDOR_PARENT_FQU,
										NULL,
										0 ) ) {
					gahp_output_return_error();
				} else {
					sec_session_id = claimid.secSessionId();
					gahp_output_return_success();
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CONDOR_VERSION) == 0) {
				peer_condor_version = args.argv[1];

				const char *reply [] = { GAHP_RESULT_SUCCESS,
										 escapeGahpString( CondorVersion() ) };
				gahp_output_return( reply, 2 );

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works
					// is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[2] = {
						"Worker thread failed",
						"NULL"
					};
					enqueue_result(args.argv[1], res, 2);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works
					// is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works
					// is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}
			
		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
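For orientation, an exchange with this handler looks roughly like the following; the version banners are illustrative and newline terminators are omitted:

	client sends               server replies
	-------------              --------------
	VERSION                    S $GahpVersion: 2.0.1 ... $
	CONDOR_VERSION 8.2.10      S <this side's CondorVersion(), GAHP-escaped>
	ASYNC_MODE_ON              S
	RESULTS                    S 0
	QUIT                       S    (then the process exits via DC_Exit)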
Exemplo n.º 22
0
void
version()
{
	printf( "%s\n%s\n", CondorVersion(), CondorPlatform() );
	exit( 0 );
}
Exemplo n.º 23
0
int
main(int argc, char *argv[])
{

	myDistro->Init( argc, argv );
	CondorVersionInfo *version;

	bool use_syscall_lib = false;
	bool print_arch = false;
	bool print_opsys = false;
	bool opsys_first = false;		// stupid flag for maintaining the
									// order we saw the args in...

	for(int i = 1; i < argc; i++ ) {
		if( argv[i][0] != '-' ) {
			fprintf( stderr, "ERROR: invalid argument: '%s'\n", argv[i] );
			usage( argv[0], 1 );
		}
		switch( argv[i][1] ) {
		case 's':
			use_syscall_lib = true;
			break;
		case 'a':
			print_arch = true;
			break;
		case 'o':
			print_opsys = true;
			if( ! print_arch ) {
				opsys_first = true;
			}
			break;
		case 'h':
			usage( argv[0], 0 );
			break;
		default:
			fprintf( stderr, "ERROR: unrecognized argument: '%s'\n", argv[i] );
			usage( argv[0], 1 );
		}
	}

	char *path=NULL, *fullpath=NULL, *vername=NULL, *platform=NULL;
	if( use_syscall_lib ) {
		config();
		path = param( "LIB" );
		if( path == NULL ) {
			fprintf( stderr, "ERROR: -syscall_lib specified but 'LIB' not "
					 "defined in configuration!\n" );
			usage( argv[0], 1 );
		}
		fullpath = (char *)malloc(strlen(path) + 24);
		ASSERT( fullpath != NULL );
		strcpy(fullpath, path);
		strcat(fullpath, "/libcondorsyscall.a");
				
		vername = NULL;
		vername = CondorVersionInfo::get_version_from_file(fullpath, vername);
		platform = NULL;
		platform = CondorVersionInfo::get_platform_from_file(fullpath, platform);

		version = new CondorVersionInfo(vername, NULL, platform);
		free(path);
		free(fullpath);
	} else {
		version = new CondorVersionInfo;
	}

	if( opsys_first ) {
		assert( print_opsys );
		printf("%s\n", version->getOpSysVer() );
	}
	if( print_arch ) {
		printf("%s\n", version->getArchVer() );
	}
	if( ! opsys_first && print_opsys ) {
		printf("%s\n", version->getOpSysVer() );
	}

	if( print_arch || print_opsys ) {
		return 0;
	}

	if( use_syscall_lib ) {
		printf( "%s\n%s\n", vername, platform );
	} else { 
		printf( "%s\n%s\n", CondorVersion(), CondorPlatform() );
	}

	delete( version );

	return 0;
}
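Typical invocations of the finished tool; the version, platform, arch, and opsys strings below are illustrative:

	$ condor_version
	$CondorVersion: 8.2.10 Oct 27 2015 BuildID: 347060 $
	$CondorPlatform: x86_64_RedHat6 $

	$ condor_version -arch -opsys
	X86_64
	LINUX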