void handleConstraints( void ) { if( ! has_constraint ) { return; } const char* tmp = global_constraint.Value(); CondorError errstack; if( doWorkByConstraint(tmp, &errstack) ) { fprintf( stdout, "Jobs matching constraint %s %s\n", tmp, (mode == JA_REMOVE_JOBS) ? "have been marked for removal" : (mode == JA_REMOVE_X_JOBS) ? "have been removed locally (remote state unknown)" : actionWord(mode,true) ); } else { fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if (had_error) { fprintf( stderr, "Couldn't find/%s all jobs matching constraint %s\n", actionWord(mode,false), tmp ); } } }
// This function calls up the schedd passed in on the command line and // registers the transferd as being available for the schedd's use. RegisterResult TransferD::register_to_schedd(ReliSock **regsock_ptr) { CondorError errstack; MyString sname; MyString id; MyString sinful; bool rval; if (*regsock_ptr != NULL) { *regsock_ptr = NULL; } sname = m_features.get_schedd_sinful(); id = m_features.get_id(); if (sname == "N/A") { // no schedd supplied with which to register dprintf(D_ALWAYS, "No schedd specified to which to register.\n"); return REG_RESULT_NO_SCHEDD; } // what is my sinful string? sinful = daemonCore->InfoCommandSinfulString(-1); dprintf(D_FULLDEBUG, "Registering myself(%s) to schedd(%s)\n", sinful.Value(), sname.Value()); // hook up to the schedd. DCSchedd schedd(sname.Value(), NULL); // register myself, give myself 1 minute to connect. rval = schedd.register_transferd(sinful, id, 20*3, regsock_ptr, &errstack); if (rval == false) { // emit why dprintf(D_ALWAYS, "TransferRequest::register_to_schedd(): Failed to " "register. Schedd gave reason '%s'\n", errstack.getFullText().c_str()); return REG_RESULT_FAILED; } // WARNING WARNING WARNING WARNING // // WARNING WARNING WARNING WARNING // // WARNING WARNING WARNING WARNING // // WARNING WARNING WARNING WARNING // // WARNING WARNING WARNING WARNING // // Here, I must infact go back to daemon core without closing or doing // anything with the socket. This is because the schedd is going to // reconnect back to me, and I can't deadlock. dprintf(D_FULLDEBUG, "Succesfully registered, awaiting treq channel message....\n"); return REG_RESULT_SUCCESS; }
bool DCMaster::sendMasterCommand( bool insure_update, int my_cmd ) { CondorError errstack; int master_cmd = my_cmd; dprintf( D_FULLDEBUG, "DCMaster::sendMasterCommand: Just starting... \n"); /* have we located the required master yet? */ if( ! _addr ) { locate(); } if( ! m_master_safesock && ! insure_update ) { m_master_safesock = new SafeSock; m_master_safesock->timeout(20); // years of research... :) if( ! m_master_safesock->connect(_addr) ) { dprintf( D_ALWAYS, "sendMasterCommand: Failed to connect to master " "(%s)\n", _addr ); delete m_master_safesock; m_master_safesock = NULL; return false; } } ReliSock reli_sock; bool result; if( insure_update ) { // For now, if we have to ensure that the update gets // there, we use a ReliSock (TCP). reli_sock.timeout(20); // years of research... :) if( ! reli_sock.connect(_addr) ) { dprintf( D_ALWAYS, "sendMasterCommand: Failed to connect to master " "(%s)\n", _addr ); return false; } result = sendCommand( master_cmd, (Sock*)&reli_sock, 0, &errstack ); } else { result = sendCommand( master_cmd, (Sock*)m_master_safesock, 0, &errstack ); } if( ! result ) { dprintf( D_FULLDEBUG, "Failed to send %d command to master\n",master_cmd ); if( m_master_safesock ) { delete m_master_safesock; m_master_safesock = NULL; } if( errstack.code() != 0 ) { dprintf( D_ALWAYS, "ERROR: %s\n", errstack.getFullText() ); } return false; } return true; }
// Called when the schedd initially connects to the transferd to finish // the registration process. int TransferD::setup_transfer_request_handler(int /*cmd*/, Stream *sock) { ReliSock *rsock = (ReliSock*)sock; MyString sock_id; dprintf(D_ALWAYS, "Got TRANSFER_CONTROL_CHANNEL!\n"); rsock->decode(); /////////////////////////////////////////////////////////////// // make sure we are authenticated /////////////////////////////////////////////////////////////// if( ! rsock->triedAuthentication() ) { CondorError errstack; if( ! SecMan::authenticate_sock(rsock, WRITE, &errstack) ) { // we failed to authenticate, we should bail out now // since we don't know what user is trying to perform // this action. // TODO: it'd be nice to print out what failed, but we // need better error propagation for that... errstack.push( "TransferD::setup_transfer_request_handler()", 42, "Failure to register transferd - Authentication failed" ); dprintf( D_ALWAYS, "setup_transfer_request_handler() " "aborting: %s\n", errstack.getFullText().c_str() ); refuse(rsock); return CLOSE_STREAM; } } rsock->decode(); /////////////////////////////////////////////////////////////// // Register this socket with a socket handler to handle incoming requests /////////////////////////////////////////////////////////////// sock_id += "<TreqChannel-Socket>"; char* _sock_id = strdup( sock_id.Value() ); //de-const // register the handler for any future transfer requests on this socket. daemonCore->Register_Socket((Sock*)rsock, _sock_id, (SocketHandlercpp)&TransferD::accept_transfer_request_handler, "TransferD::accept_transfer_request_handler", this, ALLOW); free( _sock_id ); dprintf(D_ALWAYS, "Treq channel established.\n"); dprintf(D_ALWAYS, "Accepting Transfer Requests.\n"); return KEEP_STREAM; }
//--------------------------------------------------------------------------- Qmgr_connection * DagmanClassad::OpenConnection() { // Open job queue CondorError errstack; Qmgr_connection *queue = ConnectQ( _schedd->addr(), 0, false, &errstack, NULL, _schedd->version() ); if ( !queue ) { debug_printf( DEBUG_QUIET, "WARNING: failed to connect to queue manager (%s)\n", errstack.getFullText().c_str() ); check_warning_strictness( DAG_STRICT_3 ); return NULL; } return queue; }
// Delegate an X509 proxy credential to the starter.
//
// Connects to the starter, issues DELEGATE_GSI_CRED_STARTER, sends the
// proxy file, and reads back a status reply.  Returns XUS_Okay /
// XUS_Declined per the starter's reply, XUS_Error on any communication
// or delegation failure.  result_expiration_time (optional) receives
// the delegated credential's expiration.
DCStarter::X509UpdateStatus
DCStarter::delegateX509Proxy( const char * filename, time_t expiration_time,
	char const *sec_session_id, time_t *result_expiration_time)
{
	ReliSock rsock;
	rsock.timeout(60);
	if( ! rsock.connect(_addr) ) {
		dprintf(D_ALWAYS, "DCStarter::delegateX509Proxy: "
			"Failed to connect to starter %s\n", _addr);
		return XUS_Error;
	}

	CondorError errstack;
	if( ! startCommand(DELEGATE_GSI_CRED_STARTER, &rsock, 0, &errstack,
			NULL, false, sec_session_id) ) {
		dprintf( D_ALWAYS, "DCStarter::delegateX509Proxy: "
			"Failed send command to the starter: %s\n",
			errstack.getFullText().c_str());
		return XUS_Error;
	}

	// Send the gsi proxy
	filesize_t file_size = 0;	// will receive the size of the file
	if ( rsock.put_x509_delegation(&file_size, filename, expiration_time,
			result_expiration_time) < 0 ) {
		dprintf(D_ALWAYS, "DCStarter::delegateX509Proxy "
			"failed to delegate proxy file %s (size=%ld)\n",
			filename, (long int)file_size);
		return XUS_Error;
	}

	// Fetch the result
	rsock.decode();
	int reply = 0;
	// BUGFIX: the reply read used to be unchecked; a failed read left
	// reply at a stale value.  Now a read failure is reported explicitly
	// (and still maps to XUS_Error).
	if( !rsock.code(reply) || !rsock.end_of_message() ) {
		dprintf(D_ALWAYS, "DCStarter::delegateX509Proxy: "
			"failed to read reply from the starter\n");
		return XUS_Error;
	}

	switch(reply) {
		case 0: return XUS_Error;
		case 1: return XUS_Okay;
		case 2: return XUS_Declined;
	}
	dprintf(D_ALWAYS, "DCStarter::delegateX509Proxy: "
		"remote side returned unknown code %d. Treating "
		"as an error.\n", reply);
	return XUS_Error;
}
/** * Process the history directory and maintain the history file map * * Only handle rotated history files, those history.* that are not an * index. For each one that is not in the history file map, create a * new HistoryFile, poll it for entries to process, and add it to the * map. */ void aviary::history::processHistoryDirectory() { const char *file = NULL; // each time through we rebuild our set of inodes if (force_reset) { m_historyFiles.clear(); } Directory dir ( m_path.Value() ); dir.Rewind(); while ( ( file = dir.Next() ) ) { // Skip all non-history files, e.g. history and history.*.idx if ( strncmp ( file, "history.", 8 ) || !strncmp ( file + ( strlen ( file ) - 4 ), HISTORY_INDEX_SUFFIX, 4 ) ) continue; HistoryFile h_file ( ( m_path + DIR_DELIM_STRING + file ).Value() ); CondorError errstack; if ( !h_file.init ( errstack ) ) { dprintf ( D_ALWAYS, "%s\n", errstack.getFullText().c_str() ); return; } errstack.clear(); long unsigned int id; ASSERT ( h_file.getId ( id ) ); HistoryFileListType::iterator entry = m_historyFiles.find ( id ); if ( m_historyFiles.end() == entry ) { HistoryFile::HistoryEntriesTypeIterators ij = h_file.poll ( errstack ); for ( HistoryFile::HistoryEntriesTypeIterator i = ij.first; i != ij.second; i++ ) { process ( ( *i ) ); } m_historyFiles.insert ( id ); } } }
// Assignment: take the source's file name, tear down our current state,
// and re-open from scratch.  The stat and FILE* members are deliberately
// re-initialized rather than copied, since they refer to this object's
// own open handle.
ODSHistoryFile & ODSHistoryFile::operator=(const ODSHistoryFile &base)
{
	if (this == &base) {
		return *this;
	}

	m_name = base.m_name;
	cleanup();

	// Don't just copy the stat and FILE* members, initialize them
	CondorError errstack;
	if (!init(errstack)) {
		// XXX: Should throw an exception here
		dprintf ( D_ALWAYS, "ODSHistoryFile::operator=: %s\n",
				errstack.getFullText(true).c_str());
	}

	return *this;
}
// Apply the requested job action to every job in the queue, using a
// constraint that matches all valid cluster ids.
void
handleAll()
{
	char constraint[128];
	// BUGFIX: use snprintf so a long attribute name can never overflow
	// the fixed-size buffer (sprintf had no bound).
	snprintf( constraint, sizeof(constraint), "%s >= 0", ATTR_CLUSTER_ID );

	CondorError errstack;
	if( doWorkByConstraint(constraint, &errstack) ) {
		fprintf( stdout, "All jobs %s.\n",
				 (mode == JA_REMOVE_JOBS) ?
				 "marked for removal" :
				 (mode == JA_REMOVE_X_JOBS) ?
				 "removed locally (remote state unknown)" :
				 actionWord(mode,true) );
	} else {
		fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() );
		if (had_error) {
			fprintf( stderr, "Could not %s all jobs.\n",
					 actionWord(mode,false) );
		}
	}
}
// Fetch all of this startd's ads into adsList.
// Returns true on success; logs the reason and returns false on any
// locate or query failure.
bool
DCStartd::getAds( ClassAdList &adsList )
{
	CondorError errstack;
	// fetch the query
	QueryResult q;

	// Stack-allocate the query object.  The old code heap-allocated it
	// and checked `new` for NULL -- dead code, since operator new throws
	// on failure -- and then had to repeat `delete query` on every exit
	// path.  A local removes the leak hazard entirely.
	CondorQuery query(STARTD_AD);

	if( ! this->locate() ) {
		return(false);
	}

	char* ad_addr = this->addr();
	q = query.fetchAds(adsList, ad_addr, &errstack);
	if (q != Q_OK) {
		if (q == Q_COMMUNICATION_ERROR) {
			dprintf( D_ALWAYS, "%s\n", errstack.getFullText(true).c_str() );
		}
		else {
			dprintf (D_ALWAYS, "Error: Could not fetch ads --- %s\n",
					 getStrQueryResult(q));
		}
		return (false);
	}
	return(true);
}
//--------------------------------------------------------------------------- bool Job::UnmonitorLogFile( ReadMultipleUserLogs &condorLogReader, ReadMultipleUserLogs &storkLogReader ) { debug_printf( DEBUG_DEBUG_2, "Unmonitoring log file <%s> for node %s\n", GetLogFile(), GetJobName() ); if ( !_logIsMonitored ) { debug_printf( DEBUG_DEBUG_1, "Warning: log file for node " "%s is already unmonitored\n", GetJobName() ); return true; } ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ? condorLogReader : storkLogReader; debug_printf( DEBUG_DEBUG_1, "Unmonitoring log file <%s> for node %s\n", GetLogFile(), GetJobName() ); CondorError errstack; bool result = logReader.unmonitorLogFile( GetLogFile(), errstack ); if ( !result ) { errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE, "ERROR: Unable to unmonitor log " "file for node %s", GetJobName() ); debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() ); EXCEPT( "Fatal log file monitoring error!\n" ); } if ( result ) { delete [] _logFile; _logFile = NULL; _logIsMonitored = false; } return result; }
// Perform all pending interactions with the schedd in one session:
//   1. vacate requests, 2. lease renewals, 3. discovery of new jobs,
//   4. removed/held/completed jobs, 5. dirty-attribute retrieval,
//   6. job status requests, 7. attribute updates, 8. job deletions.
// Each phase that talks through the qmgmt connection records the line
// number of any timeout in failure_line_num and jumps to
// contact_schedd_disconnect, which commits or aborts the transaction
// and reports which phase failed.  Repeated failures eventually
// EXCEPT() via contact_schedd_failure.
// NOTE(review): statement order and the goto labels are load-bearing;
// the *_complete flags record how far we got before disconnecting.
void doContactSchedd()
{
	int rc;
	Qmgr_connection *schedd;
	BaseJob *curr_job;
	ClassAd *next_ad;
	char expr_buf[12000];
	// Phase-progress flags, inspected after contact_schedd_disconnect.
	bool schedd_updates_complete = false;
	bool schedd_deletes_complete = false;
	bool add_remove_jobs_complete = false;
	bool update_jobs_complete = false;
	bool commit_transaction = true;
	int failure_line_num = 0;
	bool send_reschedule = false;
	std::string error_str = "";
	StringList dirty_job_ids;
	char *job_id_str;
	PROC_ID job_id;
	CondorError errstack;

	dprintf(D_FULLDEBUG,"in doContactSchedd()\n");

	initJobExprs();

	contactScheddTid = TIMER_UNSET;

	// vacateJobs
	/////////////////////////////////////////////////////
	if ( pendingScheddVacates.getNumElements() != 0 ) {
		std::string buff;
		StringList job_ids;
		VacateRequest curr_request;
		int result;
		ClassAd* rval;

		// Build the list of "cluster.proc" ids to vacate.
		pendingScheddVacates.startIterations();
		while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
			formatstr( buff, "%d.%d", curr_request.job->procID.cluster,
					   curr_request.job->procID.proc );
			job_ids.append( buff.c_str() );
		}

		char *tmp = job_ids.print_to_string();
		if ( tmp ) {
			dprintf( D_FULLDEBUG, "Calling vacateJobs on %s\n", tmp );
			free(tmp);
			tmp = NULL;
		}

		rval = ScheddObj->vacateJobs( &job_ids, VACATE_FAST, &errstack );
		if ( rval == NULL ) {
			formatstr( error_str, "vacateJobs returned NULL, CondorError: %s!",
					   errstack.getFullText().c_str() );
			goto contact_schedd_failure;
		} else {
			// Harvest the per-job result attributes ("job_<c>_<p>")
			// from the reply ad and hand each result to its Job object.
			pendingScheddVacates.startIterations();
			while ( pendingScheddVacates.iterate( curr_request ) != 0 ) {
				formatstr( buff, "job_%d_%d",
						   curr_request.job->procID.cluster,
						   curr_request.job->procID.proc );
				if ( !rval->LookupInteger( buff.c_str(), result ) ) {
					dprintf( D_FULLDEBUG,
							 "vacateJobs returned malformed ad\n" );
					EXCEPT( "vacateJobs returned malformed ad" );
				} else {
					dprintf( D_FULLDEBUG, " %d.%d vacate result: %d\n",
							 curr_request.job->procID.cluster,
							 curr_request.job->procID.proc,result);
					pendingScheddVacates.remove( curr_request.job->procID );
					curr_request.result = (action_result_t)result;
					curr_request.job->SetEvaluateState();
					completedScheddVacates.insert( curr_request.job->procID,
												   curr_request );
				}
			}
			delete rval;
		}
	}

	schedd = ConnectQ( ScheddAddr, QMGMT_TIMEOUT, false, NULL, myUserName,
					   CondorVersion() );
	if ( !schedd ) {
		error_str = "Failed to connect to schedd!";
		goto contact_schedd_failure;
	}

	// CheckLeases
	/////////////////////////////////////////////////////
	if ( checkLeasesSignaled ) {

		dprintf( D_FULLDEBUG, "querying for renewed leases\n" );

		// Grab the lease attributes of all the jobs in our global hashtable.
		BaseJob::JobsByProcId.startIterations();

		while ( BaseJob::JobsByProcId.iterate( curr_job ) != 0 ) {
			int new_expiration;

			rc = GetAttributeInt( curr_job->procID.cluster,
								  curr_job->procID.proc,
								  ATTR_TIMER_REMOVE_CHECK,
								  &new_expiration );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// This job doesn't have doesn't have a lease from
					// the submitter. Skip it.
					continue;
				}
			}
			curr_job->UpdateJobLeaseReceived( new_expiration );
		}

		checkLeasesSignaled = false;
	}	// end of handling check leases

	// AddJobs
	/////////////////////////////////////////////////////
	if ( addJobsSignaled || firstScheddContact ) {
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for new jobs\n" );

		// Make sure we grab all Globus Universe jobs (except held ones
		// that we previously indicated we were done with)
		// when we first start up in case we're recovering from a
		// shutdown/meltdown.
		// Otherwise, grab all jobs that are unheld and aren't marked as
		// currently being managed and aren't marked as not matched.
		// If JobManaged is undefined, equate it with false.
		// If Matched is undefined, equate it with true.
		// NOTE: Schedds from Condor 6.6 and earlier don't include
		// "(Universe==9)" in the constraint they give to the gridmanager,
		// so this gridmanager will pull down non-globus-universe ads,
		// although it won't use them. This is inefficient but not
		// incorrect behavior.
		if ( firstScheddContact ) {
			// Grab all jobs for us to manage. This expression is a
			// derivative of the expression below for new jobs. We add
			// "|| Managed =?= TRUE" to also get jobs our previous
			// incarnation was in the middle of managing when it died
			// (if it died unexpectedly). With the new term, the
			// "&& Managed =!= TRUE" from the new jobs expression becomes
			// superfluous (by boolean logic), so we drop it.
			sprintf( expr_buf,
					 "%s && %s && ((%s && %s) || %s)",
					 expr_schedd_job_constraint.c_str(),
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_managed.c_str()
				   );
		} else {
			// Grab new jobs for us to manage
			sprintf( expr_buf,
					 "%s && %s && %s && %s && %s",
					 expr_schedd_job_constraint.c_str(),
					 expr_not_completely_done.c_str(),
					 expr_matched_or_undef.c_str(),
					 expr_not_held.c_str(),
					 expr_not_managed.c_str()
				   );
		}
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *old_job;
			int job_is_matched = 1;		// default to true if not in ClassAd

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			bool job_is_managed = jobExternallyManaged(next_ad);
			next_ad->LookupBool(ATTR_JOB_MATCHED,job_is_matched);

			if ( BaseJob::JobsByProcId.lookup( procID, old_job ) != 0 ) {
				// Previously unknown job: create a Job object for it.
				JobType *job_type = NULL;
				BaseJob *new_job = NULL;

				// job had better be either managed or matched! (or both)
				ASSERT( job_is_managed || job_is_matched );

				if ( MustExpandJobAd( next_ad ) ) {
					// Get the expanded ClassAd from the schedd, which
					// has the GridResource filled in with info from
					// the matched ad.
					delete next_ad;
					next_ad = NULL;
					next_ad = GetJobAd(procID.cluster,procID.proc);
					if ( next_ad == NULL && errno == ETIMEDOUT ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
					if ( next_ad == NULL ) {
						// We may get here if it was not possible to expand
						// one of the $$() expressions. We don't want to
						// roll back the transaction and blow away the
						// hold that the schedd just put on the job, so
						// simply skip over this ad.
						dprintf(D_ALWAYS,"Failed to get expanded job ClassAd from Schedd for %d.%d. errno=%d\n",procID.cluster,procID.proc,errno);
						goto contact_schedd_next_add_job;
					}
				}

				// Search our job types for one that'll handle this job
				jobTypes.Rewind();
				while ( jobTypes.Next( job_type ) ) {
					if ( job_type->AdMatchFunc( next_ad ) ) {
						// Found one!
						dprintf( D_FULLDEBUG,
								 "Using job type %s for job %d.%d\n",
								 job_type->Name, procID.cluster,
								 procID.proc );
						break;
					}
				}
				if ( job_type != NULL ) {
					new_job = job_type->CreateFunc( next_ad );
				} else {
					dprintf( D_ALWAYS, "No handlers for job %d.%d\n",
							 procID.cluster, procID.proc );
					new_job = new BaseJob( next_ad );
				}
				ASSERT(new_job);
				new_job->SetEvaluateState();
				dprintf(D_ALWAYS,"Found job %d.%d --- inserting\n",
						new_job->procID.cluster,new_job->procID.proc);
				num_ads++;

				if ( !job_is_managed ) {
					// Mark the job as managed so the schedd stops
					// advertising it as new.
					rc = tSetAttributeString( new_job->procID.cluster,
									   new_job->procID.proc,
									   ATTR_JOB_MANAGED,
									   MANAGED_EXTERNAL);
					if ( rc < 0 ) {
						failure_line_num = __LINE__;
						commit_transaction = false;
						goto contact_schedd_disconnect;
					}
				}

			} else {
				// We already know about this job, skip
				// But also set Managed=true on the schedd so that it won't
				// keep signalling us about it
				delete next_ad;
				rc = tSetAttributeString( procID.cluster, procID.proc,
								   ATTR_JOB_MANAGED, MANAGED_EXTERNAL );
				if ( rc < 0 ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				}

			}

contact_schedd_next_add_job:
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}	// end of while next_ad
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d new job ads from schedd\n",num_ads);
	}	// end of handling add jobs

	// RemoveJobs
	/////////////////////////////////////////////////////

	// We always want to perform this check. Otherwise, we may overwrite a
	// REMOVED/HELD/COMPLETED status with something else below.
	{
		int num_ads = 0;

		dprintf( D_FULLDEBUG, "querying for removed/held jobs\n" );

		// Grab jobs marked as REMOVED/COMPLETED or marked as HELD that we
		// haven't previously indicated that we're done with (by setting
		// JobManaged to "Schedd".
		sprintf( expr_buf, "(%s) && (%s) && (%s == %d || %s == %d || (%s == %d && %s =?= \"%s\"))",
				 ScheddJobConstraint, expr_not_completely_done.c_str(),
				 ATTR_JOB_STATUS, REMOVED, ATTR_JOB_STATUS, COMPLETED,
				 ATTR_JOB_STATUS, HELD, ATTR_JOB_MANAGED, MANAGED_EXTERNAL );

		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			PROC_ID procID;
			BaseJob *next_job;
			int curr_status;

			next_ad->LookupInteger( ATTR_CLUSTER_ID, procID.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, procID.proc );
			next_ad->LookupInteger( ATTR_JOB_STATUS, curr_status );

			if ( BaseJob::JobsByProcId.lookup( procID, next_job ) == 0 ) {
				// Should probably skip jobs we already have marked as
				// held or removed
				next_job->JobAdUpdateFromSchedd( next_ad, true );
				num_ads++;
			} else if ( curr_status == REMOVED ) {
				// If we don't know about the job, act like we got an
				// ADD_JOBS signal from the schedd the next time we
				// connect, so that we'll create a Job object for it
				// and decide how it needs to be handled.
				// TODO The AddJobs and RemoveJobs queries shoule be
				// combined into a single query.
				dprintf( D_ALWAYS,
						 "Don't know about removed job %d.%d. "
						 "Will treat it as a new job to manage\n",
						 procID.cluster, procID.proc );
				addJobsSignaled = true;
			} else {
				dprintf( D_ALWAYS,
						 "Don't know about held/completed job %d.%d. "
						 "Ignoring it\n",
						 procID.cluster, procID.proc );
			}

			delete next_ad;
			next_ad = GetNextJobByConstraint( expr_buf, 0 );
		}
		if ( errno == ETIMEDOUT ) {
			failure_line_num = __LINE__;
			commit_transaction = false;
			goto contact_schedd_disconnect;
		}

		dprintf(D_FULLDEBUG,"Fetched %d job ads from schedd\n",num_ads);
	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	add_remove_jobs_complete = true;

	// Retrieve dirty attributes
	/////////////////////////////////////////////////////
	if ( updateJobsSignaled ) {
		dprintf( D_FULLDEBUG, "querying for jobs with attribute updates\n" );

		sprintf( expr_buf, "%s && %s && %s && %s",
				 expr_schedd_job_constraint.c_str(),
				 expr_not_completely_done.c_str(),
				 expr_not_held.c_str(),
				 expr_managed.c_str()
			   );
		dprintf( D_FULLDEBUG,"Using constraint %s\n",expr_buf);
		next_ad = GetNextDirtyJobByConstraint( expr_buf, 1 );
		while ( next_ad != NULL ) {
			ClassAd updates;
			char str[PROC_ID_STR_BUFLEN];
			next_ad->LookupInteger( ATTR_CLUSTER_ID, job_id.cluster );
			next_ad->LookupInteger( ATTR_PROC_ID, job_id.proc );
			if ( GetDirtyAttributes( job_id.cluster, job_id.proc,
									 &updates ) < 0 ) {
				dprintf( D_ALWAYS, "Failed to retrieve dirty attributes for job %d.%d\n", job_id.cluster, job_id.proc );
				failure_line_num = __LINE__;
				delete next_ad;
				goto contact_schedd_disconnect;
			}
			else {
				dprintf (D_FULLDEBUG, "Retrieved updated attributes for job %d.%d\n", job_id.cluster, job_id.proc);
				dPrintAd(D_JOB, updates);
			}
			if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
				// Known job: apply the updates and remember the id so
				// we can ask the schedd to clear its dirty bits later.
				curr_job->JobAdUpdateFromSchedd( &updates, false );
				ProcIdToStr( job_id, str );
				dirty_job_ids.append( str );
			}
			else {
				dprintf( D_ALWAYS,
						 "Don't know about updated job %d.%d. "
						 "Ignoring it\n",
						 job_id.cluster, job_id.proc );
			}
			delete next_ad;
			next_ad = GetNextDirtyJobByConstraint( expr_buf, 0 );
		}
	}
	update_jobs_complete = true;

//	if ( BeginTransaction() < 0 ) {
	// BeginTransaction() has no usable return value here; a timeout is
	// detected via errno instead.
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	// requestJobStatus
	/////////////////////////////////////////////////////
	if ( pendingJobStatus.getNumElements() != 0 ) {
		JobStatusRequest curr_request;

		pendingJobStatus.startIterations();
		while ( pendingJobStatus.iterate( curr_request ) != 0 ) {

			int status;

			rc = GetAttributeInt( curr_request.job_id.cluster,
								  curr_request.job_id.proc,
								  ATTR_JOB_STATUS, &status );
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// The job is not in the schedd's job queue. This
					// probably means that the user did a condor_rm -f,
					// so return a job status of REMOVED.
					status = REMOVED;
				}
			}
			// return status
			dprintf( D_FULLDEBUG, "%d.%d job status: %d\n",
					 curr_request.job_id.cluster,
					 curr_request.job_id.proc, status );
			pendingJobStatus.remove( curr_request.job_id );
			curr_request.job_status = status;
			daemonCore->Reset_Timer( curr_request.tid, 0 );
			completedJobStatus.insert( curr_request.job_id,
									   curr_request );
		}

	}

	// Update existing jobs
	/////////////////////////////////////////////////////
	ScheddUpdateRequest *curr_request;
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		dprintf(D_FULLDEBUG,"Updating classad values for %d.%d:\n",
				curr_job->procID.cluster, curr_job->procID.proc);
		const char *attr_name;
		const char *attr_value;
		ExprTree *expr;
		bool fake_job_in_queue = false;
		curr_job->jobAd->ResetExpr();
		// Push every dirty attribute of the job ad to the schedd.
		while ( curr_job->jobAd->NextDirtyExpr(attr_name, expr) == true &&
				fake_job_in_queue == false ) {
			attr_value = ExprTreeToString( expr );

			dprintf(D_FULLDEBUG," %s = %s\n",attr_name,attr_value);
			rc = SetAttribute( curr_job->procID.cluster,
							   curr_job->procID.proc,
							   attr_name,
							   attr_value);
			if ( rc < 0 ) {
				if ( errno == ETIMEDOUT ) {
					failure_line_num = __LINE__;
					commit_transaction = false;
					goto contact_schedd_disconnect;
				} else {
					// The job is not in the schedd's job queue. This
					// probably means that the user did a condor_rm -f,
					// so pretend that all updates for the job succeed.
					// Otherwise, we'll never make forward progress on
					// the job.
					// TODO We should also fake a job status of REMOVED
					// to the job, so it can do what cleanup it can.
					fake_job_in_queue = true;
					break;
				}
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_updates_complete = true;

	// Delete existing jobs
	/////////////////////////////////////////////////////
	errno = 0;
	BeginTransaction();
	if ( errno == ETIMEDOUT ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	pendingScheddUpdates.startIterations();
	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		if ( curr_job->deleteFromSchedd ) {
			dprintf(D_FULLDEBUG,"Deleting job %d.%d from schedd\n",
					curr_job->procID.cluster,
					curr_job->procID.proc);
			rc = DestroyProc(curr_job->procID.cluster,
							 curr_job->procID.proc);
			// NOENT means the job doesn't exist. Good enough for us.
			if ( rc < 0 && rc != DESTROYPROC_ENOENT) {
				failure_line_num = __LINE__;
				commit_transaction = false;
				goto contact_schedd_disconnect;
			}
		}

	}

	if ( RemoteCommitTransaction() < 0 ) {
		failure_line_num = __LINE__;
		commit_transaction = false;
		goto contact_schedd_disconnect;
	}

	schedd_deletes_complete = true;


 contact_schedd_disconnect:
	// Common exit path for the qmgmt session: commit (or abort) and then
	// figure out which phases completed before the disconnect.
	DisconnectQ( schedd, commit_transaction );

	if ( add_remove_jobs_complete == true ) {
		firstScheddContact = false;
		addJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during Add/RemoveJobs at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( update_jobs_complete == true ) {
		updateJobsSignaled = false;
	} else {
		formatstr( error_str, "Schedd connection error during dirty attribute update at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	if ( schedd_updates_complete == false ) {
		formatstr( error_str, "Schedd connection error during updates at line %d!", failure_line_num );
		goto contact_schedd_failure;
	}

	// Clear dirty bits for all jobs updated
	if ( !dirty_job_ids.isEmpty() ) {
		ClassAd *rval;
		dprintf( D_FULLDEBUG, "Calling clearDirtyAttrs on %d jobs\n",
				 dirty_job_ids.number() );
		dirty_job_ids.rewind();
		rval = ScheddObj->clearDirtyAttrs( &dirty_job_ids, &errstack );
		if ( rval == NULL ) {
			dprintf(D_ALWAYS, "Failed to notify schedd to clear dirty attributes. CondorError: %s\n", errstack.getFullText().c_str() );
		}
		delete rval;
	}

	// Wake up jobs that had schedd updates pending and delete job
	// objects that wanted to be deleted
	pendingScheddUpdates.startIterations();

	while ( pendingScheddUpdates.iterate( curr_request ) != 0 ) {

		curr_job = curr_request->m_job;
		curr_job->jobAd->ClearAllDirtyFlags();

		if ( curr_job->deleteFromGridmanager ) {

			// If the Job object wants to delete the job from the
			// schedd but we failed to do so, don't delete the job
			// object yet; wait until we successfully delete the job
			// from the schedd.
			if ( curr_job->deleteFromSchedd == true &&
				 schedd_deletes_complete == false ) {
				continue;
			}

			// If wantRematch is set, send a reschedule now
			if ( curr_job->wantRematch ) {
				send_reschedule = true;
			}
			pendingScheddUpdates.remove( curr_job->procID );
			pendingScheddVacates.remove( curr_job->procID );
			pendingJobStatus.remove( curr_job->procID );
			completedJobStatus.remove( curr_job->procID );
			completedScheddVacates.remove( curr_job->procID );
			delete curr_job;

		} else {
			pendingScheddUpdates.remove( curr_job->procID );

			if ( curr_request->m_notify ) {
				curr_job->SetEvaluateState();
			}
		}

		delete curr_request;
	}

	// Poke objects that wanted to be notified when a schedd update completed
	// successfully (possibly minus deletes)
	int timer_id;
	scheddUpdateNotifications.Rewind();
	while ( scheddUpdateNotifications.Next( timer_id ) ) {
		daemonCore->Reset_Timer( timer_id, 0 );
	}
	scheddUpdateNotifications.Clear();

	if ( send_reschedule == true ) {
		ScheddObj->reschedule();
	}

	// Check if we have any jobs left to manage. If not, exit.
	if ( BaseJob::JobsByProcId.getNumElements() == 0 ) {
		dprintf( D_ALWAYS, "No jobs left, shutting down\n" );
		daemonCore->Send_Signal( daemonCore->getpid(), SIGTERM );
	}

	lastContactSchedd = time(NULL);

	if ( schedd_deletes_complete == false ) {
		error_str = "Problem using DestroyProc to delete jobs!";
		goto contact_schedd_failure;
	}

	scheddFailureCount = 0;

	// For each job that had dirty attributes, re-evaluate the policy
	dirty_job_ids.rewind();
	while ( (job_id_str = dirty_job_ids.next()) != NULL ) {
		StrToProcIdFixMe(job_id_str, job_id);
		if ( BaseJob::JobsByProcId.lookup( job_id, curr_job ) == 0 ) {
			curr_job->EvalPeriodicJobExpr();
		}
	}

	dprintf(D_FULLDEBUG,"leaving doContactSchedd()\n");
	return;

 contact_schedd_failure:
	// Count consecutive failures; give up entirely once the limit is hit,
	// otherwise schedule another attempt.
	scheddFailureCount++;
	if ( error_str == "" ) {
		error_str = "Failure in doContactSchedd";
	}
	if ( scheddFailureCount >= maxScheddFailures ) {
		dprintf( D_ALWAYS, "%s\n", error_str.c_str() );
		EXCEPT( "Too many failures connecting to schedd!" );
	}
	dprintf( D_ALWAYS, "%s Will retry\n", error_str.c_str() );
	lastContactSchedd = time(NULL);
	RequestContactSchedd();
	return;
}
// Ask the transfer queue manager for an upload/download slot.
//
// downloading  - true for a download slot, false for upload
// fname/jobid  - identify the transfer for logging/accounting
// timeout      - hard deadline in seconds (no timeout multiplier)
// error_desc   - receives the rejection reason on failure
//
// Returns true when a slot is immediately available (GoAheadAlways) or
// when the request was successfully initiated; in the latter case the
// caller must poll via PollForTransferQueueSlot() for the response.
// Returns false (with error_desc and m_xfer_rejected_reason set) on any
// connection or protocol failure.
bool
DCTransferQueue::RequestTransferQueueSlot(bool downloading,char const *fname,char const *jobid,int timeout,MyString &error_desc)
{
	ASSERT(fname);
	ASSERT(jobid);

	if( GoAheadAlways( downloading ) ) {
		// Queueing is disabled for this direction; record the transfer
		// details and grant the slot immediately.
		m_xfer_downloading = downloading;
		m_xfer_fname = fname;
		m_xfer_jobid = jobid;
		return true;
	}
	CheckTransferQueueSlot();
	if( m_xfer_queue_sock ) {
		// A request has already been made.
		// Currently, this is a no-op, because any upload/download slot
		// is as good as any other. In the future, there may be
		// different queues for different paths.
		ASSERT( m_xfer_downloading == downloading );
		m_xfer_fname = fname;
		m_xfer_jobid = jobid;
		return true;
	}

	time_t started = time(NULL);
	CondorError errstack;
	// Our caller has to finish this operation in the specified
	// amount of time or risk not responding to the file transfer
	// peer in time, so ignore the timeout multiplier and set the
	// timeout exactly as specified.
	m_xfer_queue_sock = reliSock( timeout, 0, &errstack, false, true );

	if( !m_xfer_queue_sock ) {
		formatstr(m_xfer_rejected_reason,
			"Failed to connect to transfer queue manager for job %s (%s): %s.",
			jobid, fname, errstack.getFullText().c_str() );
		error_desc = m_xfer_rejected_reason;
		dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str());
		return false;
	}

	if( timeout ) {
		// Charge the time spent connecting against the caller's
		// deadline, but always leave at least one second.
		timeout -= time(NULL)-started;
		if( timeout <= 0 ) {
			timeout = 1;
		}
	}

	bool connected = startCommand(
		TRANSFER_QUEUE_REQUEST, m_xfer_queue_sock, timeout, &errstack );

	if( !connected ) {
		delete m_xfer_queue_sock;
		m_xfer_queue_sock = NULL;
		formatstr(m_xfer_rejected_reason,
			"Failed to initiate transfer queue request for job %s (%s): %s.",
			jobid, fname, errstack.getFullText().c_str() );
		error_desc = m_xfer_rejected_reason;
		dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str());
		return false;
	}

	m_xfer_downloading = downloading;
	m_xfer_fname = fname;
	m_xfer_jobid = jobid;

	// Describe the transfer to the queue manager.
	ClassAd msg;
	msg.Assign(ATTR_DOWNLOADING,downloading);
	msg.Assign(ATTR_FILE_NAME,fname);
	msg.Assign(ATTR_JOB_ID,jobid);

	m_xfer_queue_sock->encode();

	if( !msg.put(*m_xfer_queue_sock) || !m_xfer_queue_sock->end_of_message() )
	{
		formatstr(m_xfer_rejected_reason,
			"Failed to write transfer request to %s for job %s "
			"(initial file %s).",
			m_xfer_queue_sock->peer_description(),
			m_xfer_jobid.c_str(),
			m_xfer_fname.c_str());
		error_desc = m_xfer_rejected_reason;
		dprintf(D_ALWAYS,"%s\n",m_xfer_rejected_reason.c_str());
		return false;
	}

	m_xfer_queue_sock->decode();

	// Request has been initiated. Now sender should call
	// PollForTransferQueueSlot() to get response.
	m_xfer_queue_pending = true;
	return true;
}
// Dispatch one qmgmt (job queue management) remote syscall arriving on
// syscall_sock.  Reads the request number, decodes the per-request
// arguments, performs the operation, and encodes the reply.
//
// Protocol notes:
//   - Wire I/O is wrapped in assert(); a failed decode/encode aborts the
//     handler.  NOTE(review): side effects inside assert() vanish if the
//     file is ever built with NDEBUG — confirm this TU is never built so.
//   - Strings decoded via code(char*&) are heap-allocated by the CEDAR
//     layer and must be free()d here.
//   - Returns 0 to keep the connection alive, -1 to close it.
//   - may_fork is set to true for requests that are safe to service in a
//     forked child (read-only connections).
int
do_Q_request(ReliSock *syscall_sock,bool &may_fork)
{
	int	request_num = -1;
	int	rval;

	syscall_sock->decode();

	assert( syscall_sock->code(request_num) );

	dprintf(D_SYSCALLS, "Got request #%d\n", request_num);

	switch( request_num ) {

	case CONDOR_InitializeConnection:
	{
		// dprintf( D_ALWAYS, "InitializeConnection()\n" );
		bool authenticated = true;

		// Authenticate socket, if not already done by daemonCore
		if( !syscall_sock->triedAuthentication() ) {
			if( IsDebugLevel(D_SECURITY) ) {
				MyString methods;
				SecMan::getAuthenticationMethods( WRITE, &methods );
				dprintf(D_SECURITY,"Calling authenticate(%s) in qmgmt_receivers\n", methods.Value());
			}
			CondorError errstack;
			if( ! SecMan::authenticate_sock(syscall_sock, WRITE, &errstack) ) {
					// Failed to authenticate
				dprintf( D_ALWAYS, "SCHEDD: authentication failed: %s\n",
						 errstack.getFullText().c_str() );
				authenticated = false;
			}
		}

		if ( authenticated ) {
			InitializeConnection( syscall_sock->getOwner(),
					syscall_sock->getDomain() );
		} else {
			InitializeConnection( NULL, NULL );
		}
		return 0;
	}

	case CONDOR_InitializeReadOnlyConnection:
	{
		// dprintf( D_ALWAYS, "InitializeReadOnlyConnection()\n" );

		// Since InitializeConnection() does nothing, and we need
		// to record the fact that this is a read-only connection,
		// but we have to do it in the socket (since we don't have
		// any other persistent data structure, and it's probably
		// the right place anyway), set the FQU.
		//
		// We need to record if this is a read-only connection so that
		// we can avoid expanding $$ in GetJobAd; simply checking if the
		// connection is authenticated isn't sufficient, because the
		// security session cache means that read-only connection could
		// be authenticated by a previous authenticated connection from
		// the same address (when using host-based security) less than
		// the expiration period ago.
		syscall_sock->setFullyQualifiedUser( "read-only" );

		// same as InitializeConnection but no authenticate()
		InitializeConnection( NULL, NULL );

		may_fork = true;
		return 0;
	}

	case CONDOR_SetEffectiveOwner:
	{
		MyString owner;
		int terrno;

		assert( syscall_sock->get(owner) );
		assert( syscall_sock->end_of_message() );

		rval = QmgmtSetEffectiveOwner( owner.Value() );
		terrno = errno;

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );

		char const *fqu = syscall_sock->getFullyQualifiedUser();
		dprintf(D_SYSCALLS, "\tSetEffectiveOwner\n");
		dprintf(D_SYSCALLS, "\tauthenticated user = '******'\n", fqu ? fqu : "");
		dprintf(D_SYSCALLS, "\trequested owner = '%s'\n", owner.Value());
		dprintf(D_SYSCALLS, "\trval %d, errno %d\n", rval, terrno);

		return 0;
	}

	case CONDOR_NewCluster:
	{
		int terrno;

		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = NewCluster( );
		terrno = errno;
		dprintf(D_SYSCALLS, "\tNewCluster: rval = %d, errno = %d\n",rval,terrno );
		if ( rval > 0 ) {
			dprintf( D_AUDIT, *syscall_sock, "Submitting new job %d.0\n", rval );
		}

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;

		dprintf(D_FULLDEBUG,"schedd: NewCluster rval %d errno %d\n",rval,terrno);

		return 0;
	}

	case CONDOR_NewProc:
	{
		int cluster_id = -1;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = NewProc( cluster_id );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );
		if ( rval > 0 ) {
			dprintf( D_AUDIT, *syscall_sock, "Submitting new job %d.%d\n", cluster_id, rval );
		}

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;

		dprintf(D_FULLDEBUG,"schedd: NewProc rval %d errno %d\n",rval,terrno);

		return 0;
	}

	case CONDOR_DestroyProc:
	{
		int cluster_id = -1;
		int proc_id = -1;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = DestroyProc( cluster_id, proc_id );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;

		dprintf(D_FULLDEBUG,"schedd: DestroyProc cluster %d proc %d rval %d errno %d\n",cluster_id,proc_id,rval,terrno);

		return 0;
	}

	case CONDOR_DestroyCluster:
	{
		int cluster_id = -1;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = DestroyCluster( cluster_id );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

#if 0
	case CONDOR_DestroyClusterByConstraint:
	{
		char *constraint=NULL;
		int terrno;

		assert( syscall_sock->code(constraint) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = DestroyClusterByConstraint( constraint );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		free( (char *)constraint );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}
#endif

	case CONDOR_SetAttributeByConstraint:
	case CONDOR_SetAttributeByConstraint2:
	{
		char *attr_name=NULL;
		char *attr_value=NULL;
		char *constraint=NULL;
		int terrno;
		SetAttributeFlags_t flags = 0;

		assert( syscall_sock->code(constraint) );
		dprintf( D_SYSCALLS, "	constraint = %s\n",constraint);
		assert( syscall_sock->code(attr_value) );
		assert( syscall_sock->code(attr_name) );
		if( request_num == CONDOR_SetAttributeByConstraint2 ) {
				// The "2" variant additionally carries per-call flags.
			assert( syscall_sock->code( flags ) );
		}
		assert( syscall_sock->end_of_message() );;

		if (strcmp (attr_name, ATTR_MYPROXY_PASSWORD) == 0) {
			// Hide the proxy password from the ClassAd; report success
			// without storing anything.
			errno = 0;
			dprintf( D_SYSCALLS, "SetAttributeByConstraint (MyProxyPassword) not supported...\n");
			rval = 0;
			terrno = errno;
		} else {

			errno = 0;
			rval = SetAttributeByConstraint( constraint, attr_name, attr_value, flags );
			terrno = errno;
			dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );
			if ( rval == 0 ) {
				dprintf( D_AUDIT, *syscall_sock,
						 "Set Attribute By Constraint %s, "
						 "%s = %s\n",
						 constraint, attr_name, attr_value);
			}

		}

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		free( (char *)constraint );
		free( (char *)attr_value );
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_SetAttribute:
	case CONDOR_SetAttribute2:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		char *attr_value=NULL;
		int terrno;
		SetAttributeFlags_t flags = 0;
		const char *users_username;
		const char *condor_username;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_value) );
		assert( syscall_sock->code(attr_name) );
		if( request_num == CONDOR_SetAttribute2 ) {
			assert( syscall_sock->code( flags ) );
		}
		users_username = syscall_sock->getOwner();
		condor_username = get_condor_username();
		if (attr_name) dprintf(D_SYSCALLS,"\tattr_name = %s\n",attr_name);
		if (attr_value) dprintf(D_SYSCALLS,"\tattr_value = %s\n",attr_value);
		assert( syscall_sock->end_of_message() );;

		// ckireyev:
		// We do NOT want to include MyProxy password in the ClassAd (since it's a secret)
		// I'm not sure if this is the best place to do this, but....
		if (attr_name && attr_value && strcmp (attr_name, ATTR_MYPROXY_PASSWORD) == 0) {
			errno = 0;
			dprintf( D_SYSCALLS, "Got MyProxyPassword, stashing...\n");
			rval = SetMyProxyPassword (cluster_id, proc_id, attr_value);
			terrno = errno;
			dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		}
		else {
			errno = 0;
			rval = SetAttribute( cluster_id, proc_id, attr_name, attr_value, flags );
			terrno = errno;
			dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );
				// If we're modifying a previously-submitted job AND either
				// the client's username is not HTCondor's (i.e. not a
				// daemon) OR the client says we should log...
			if( (cluster_id != active_cluster_num) && (rval == 0) &&
				( strcmp(users_username, condor_username) || (flags & SHOULDLOG) ) ) {

				dprintf( D_AUDIT, *syscall_sock,
						 "Set Attribute for job %d.%d, "
						 "%s = %s\n",
						 cluster_id, proc_id, attr_name, attr_value);
			}
		}

		free( (char *)attr_value );
		free( (char *)attr_name );

		if( flags & SetAttribute_NoAck ) {
				// Fire-and-forget variant: no reply is sent; a failure
				// tears down the connection instead.
			if( rval < 0 ) {
				return -1;
			}
		}
		else {
			syscall_sock->encode();
			assert( syscall_sock->code(rval) );
			if( rval < 0 ) {
				assert( syscall_sock->code(terrno) );
			}
			assert( syscall_sock->end_of_message() );
		}
		return 0;
	}

	case CONDOR_SetTimerAttribute:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		int duration = 0;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		if (attr_name) dprintf(D_SYSCALLS,"\tattr_name = %s\n",attr_name);
		assert( syscall_sock->code(duration) );
		dprintf(D_SYSCALLS,"\tduration = %d\n",duration);
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = SetTimerAttribute( cluster_id, proc_id, attr_name, duration );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );
		dprintf( D_AUDIT, *syscall_sock,
				 "Set Timer Attribute for job %d.%d, "
				 "attr_name = %s, duration = %d\n",
				 cluster_id, proc_id, attr_name, duration);

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_BeginTransaction:
	{
		int terrno;

		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = 0;	// BeginTransaction returns void (sigh), so always success
		BeginTransaction( );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_AbortTransaction:
	{
		int terrno;

		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = 0;	// AbortTransaction returns void (sigh), so always success
		AbortTransaction( );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );

		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}

		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_CommitTransactionNoFlags:
	case CONDOR_CommitTransaction:
	{
		int terrno;
		int flags;

		if( request_num == CONDOR_CommitTransaction ) {
			assert( syscall_sock->code(flags) );
		}
		else {
			flags = 0;
		}
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		CondorError errstack;
			// Validate the pending transaction before committing it.
		rval = CheckTransaction( flags, & errstack );
		terrno = errno;
		dprintf( D_SYSCALLS, "\tflags = %d, rval = %d, errno = %d\n", flags, rval, terrno );

		if( rval >= 0 ) {
			errno = 0;
			CommitTransaction( flags );
				// CommitTransaction() never returns on failure
			rval = 0;
			terrno = errno;
			dprintf( D_SYSCALLS, "\tflags = %d, rval = %d, errno = %d\n", flags, rval, terrno );
		}

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );

			const CondorVersionInfo *vers = syscall_sock->get_peer_version();
			if (vers && vers->built_since_version(8, 3, 4)) {
				// Send a classad, for less backwards-incompatibility.
				int code = 1;
				const char * reason = "QMGMT rejected job submission.";
				if( errstack.subsys() ) {
					code = 2;
					reason = errstack.message();
				}

				ClassAd reply;
				reply.Assign( "ErrorCode", code );
				reply.Assign( "ErrorReason", reason );
				assert( putClassAd( syscall_sock, reply ) );
			}
		}
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetAttributeFloat:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		float value = 0.0;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		if( QmgmtMayAccessAttribute( attr_name ) ) {
			rval = GetAttributeFloat( cluster_id, proc_id, attr_name, &value );
		}
		else {
			rval = -1;
		}
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( syscall_sock->code(value) );
		}
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetAttributeInt:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		int value = 0;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		dprintf( D_SYSCALLS, "	attr_name = %s\n", attr_name );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		if( QmgmtMayAccessAttribute( attr_name ) ) {
			rval = GetAttributeInt( cluster_id, proc_id, attr_name, &value );
		}
		else {
			rval = -1;
		}
		terrno = errno;
		if (rval < 0) {
			dprintf( D_SYSCALLS, "GetAttributeInt(%d, %d, %s) not found.\n",
					cluster_id, proc_id, attr_name);
		} else {
			dprintf( D_SYSCALLS, "  value: %d\n", value );
			dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );
		}

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( syscall_sock->code(value) );
		}
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetAttributeString:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		char *value = NULL;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		if( QmgmtMayAccessAttribute( attr_name ) ) {
			rval = GetAttributeStringNew( cluster_id, proc_id, attr_name, &value );
		}
		else {
			rval = -1;
		}
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( syscall_sock->code(value) );
		}
		free( (char *)value );
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetAttributeExpr:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;

		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		assert( syscall_sock->end_of_message() );;

		char *value = NULL;

		errno = 0;
		if( QmgmtMayAccessAttribute( attr_name ) ) {
			rval = GetAttributeExprNew( cluster_id, proc_id, attr_name, &value );
		}
		else {
			rval = -1;
		}
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();

			// This case replies without assert() so that `value` can be
			// freed on an encode failure before bailing out.
		if ( !syscall_sock->code(rval) ) {
			free(value);
			return -1;
		}
		if( rval < 0 ) {
			if ( !syscall_sock->code(terrno) ) {
				free(value);
				return -1;
			}
		}
		if( rval >= 0 ) {
			if ( !syscall_sock->code(value) ) {
				free(value);
				return -1;
			}
		}
		free( (char *)value );
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetDirtyAttributes:
	{
		int cluster_id = -1;
		int proc_id = -1;
		ClassAd updates;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = GetDirtyAttributes( cluster_id, proc_id, &updates );

		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		if ( !syscall_sock->code(rval) ) {
			return -1;
		}
		if( rval < 0 ) {
			if ( !syscall_sock->code(terrno) ) {
				return -1;
			}
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, updates) );
		}
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_DeleteAttribute:
	{
		int cluster_id = -1;
		int proc_id = -1;
		char *attr_name=NULL;
		int terrno;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->code(attr_name) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = DeleteAttribute( cluster_id, proc_id, attr_name );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		free( (char *)attr_name );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetJobAd:
	{
		int cluster_id = -1;
		int proc_id = -1;
		ClassAd *ad = NULL;
		int terrno;
		bool delete_ad = false;

		assert( syscall_sock->code(cluster_id) );
		dprintf( D_SYSCALLS, "	cluster_id = %d\n", cluster_id );
		assert( syscall_sock->code(proc_id) );
		dprintf( D_SYSCALLS, "	proc_id = %d\n", proc_id );
		assert( syscall_sock->end_of_message() );;

		// dprintf( D_ALWAYS, "(%d.%d) isAuthenticated() = %d\n", cluster_id, proc_id, syscall_sock->isAuthenticated() );
		// dprintf( D_ALWAYS, "(%d.%d) getOwner() = %s\n", cluster_id, proc_id, syscall_sock->getOwner() );

		errno = 0;
		// Only fetch the jobad for legal values of cluster/proc
		if( cluster_id >= 1 ) {
			if( proc_id >= 0 ) {
				const char * fqu = syscall_sock->getFullyQualifiedUser();
				if( fqu != NULL && strcmp( fqu, "read-only" ) != 0 ) {
					// expand $$() macros in the jobad as required by GridManager.
					// The GridManager depends on the fact that the following call
					// expands $$ and saves the expansions to disk in case of
					// restart.
					ad = GetJobAd_as_ClassAd( cluster_id, proc_id, true, true );
					delete_ad = true;
					// note : since we expanded the ad, ad is now a deep
					// copy of the ad in memory, so we must delete it below.
				} else {
					ad = GetJobAd_as_ClassAd( cluster_id, proc_id, false, false );
				}
			} else if( proc_id == -1 ) {
				// allow cluster ad to be queried as required by preen, but
				// do NOT ask to expand $$() macros in a cluster ad!
				ad = GetJobAd_as_ClassAd( cluster_id, proc_id, false, false );
			}
		}
		terrno = errno;
		rval = ad ? 0 : -1;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) );
		}
		// If we called GetJobAd() with the third bool argument set
		// to True (expandedAd), it does a deep copy of the ad in the
		// queue in order to expand the $$() attributes.  So we must
		// delete it.
		if (delete_ad) delete ad;
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetJobByConstraint:
	{
		char *constraint=NULL;
		ClassAd *ad;
		int terrno;

		assert( syscall_sock->code(constraint) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		ad = GetJobByConstraint_as_ClassAd( constraint );
		terrno = errno;
		rval = ad ? 0 : -1;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) );
		}
		FreeJobAd(ad);
		free( (char *)constraint );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetNextJob:
	{
		ClassAd *ad;
		int initScan = 0;
		int terrno;

		assert( syscall_sock->code(initScan) );
		dprintf( D_SYSCALLS, "	initScan = %d\n", initScan );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		ad = GetNextJob( initScan );
		terrno = errno;
		rval = ad ? 0 : -1;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) );
		}
		FreeJobAd(ad);
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetNextJobByConstraint:
	{
		char *constraint=NULL;
		ClassAd *ad;
		int initScan = 0;
		int terrno;

		assert( syscall_sock->code(initScan) );
		dprintf( D_SYSCALLS, "	initScan = %d\n", initScan );
		if ( !(syscall_sock->code(constraint)) ) {
			if (constraint != NULL) {
				free(constraint);
				constraint = NULL;
			}
			return -1;
		}
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		ad = GetNextJobByConstraint( constraint, initScan );
		terrno = errno;
		rval = ad ? 0 : -1;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) );
		}
		FreeJobAd(ad);
		free( (char *)constraint );
		assert( syscall_sock->end_of_message() );;
		return 0;
	}

	case CONDOR_GetNextDirtyJobByConstraint:
	{
		char *constraint=NULL;
		ClassAd *ad;
		int initScan = 0;
		int terrno;

		assert( syscall_sock->code(initScan) );
		dprintf( D_SYSCALLS, "	initScan = %d\n", initScan );
		if ( !(syscall_sock->code(constraint)) ) {
			if (constraint != NULL) {
				free(constraint);
				constraint = NULL;
			}
			return -1;
		}
		assert( syscall_sock->end_of_message() );

		errno = 0;
		ad = GetNextDirtyJobByConstraint( constraint, initScan );
		terrno = errno;
		rval = ad ? 0 : -1;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		if( rval >= 0 ) {
			assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE) );
		}
		FreeJobAd(ad);
		free( (char *)constraint );
		assert( syscall_sock->end_of_message() );
		return 0;
	}

	case CONDOR_SendSpoolFile:
	{
		char *filename=NULL;
		int terrno;

		assert( syscall_sock->code(filename) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = SendSpoolFile( filename );
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

			// No reply is sent here; SendSpoolFile() handles the
			// subsequent file-transfer protocol itself.
#if 0
		syscall_sock->encode();
		assert( syscall_sock->code(rval) );
		if( rval < 0 ) {
			assert( syscall_sock->code(terrno) );
		}
		assert( syscall_sock->end_of_message() );;
#endif
		free( (char *)filename );
		return 0;
	}

	case CONDOR_SendSpoolFileIfNeeded:
	{
		int terrno;

		ClassAd ad;
		assert( getClassAd(syscall_sock, ad) );
		assert( syscall_sock->end_of_message() );;

		errno = 0;
		rval = SendSpoolFileIfNeeded(ad);
		terrno = errno;
		dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

		return 0;
	}

	case CONDOR_GetAllJobsByConstraint:
	{
		char *constraint=NULL;
		char *projection=NULL;
		ClassAd *ad;
		int terrno;
		int initScan = 1;
		classad::References proj;

		if ( !(syscall_sock->code(constraint)) ) {
			if (constraint != NULL) {
				free(constraint);
				constraint = NULL;
			}
			return -1;
		}
		if ( !(syscall_sock->code(projection)) ) {
				// NOTE(review): if projection is NULL here, constraint is
				// leaked on this early return — confirm whether the
				// decode can fail with projection still NULL.
			if (projection != NULL) {
				free(constraint);
				free(projection);
				projection = NULL;
			}
			return -1;
		}
		dprintf( D_SYSCALLS, "	constraint = %s\n", constraint );
		dprintf( D_SYSCALLS, "	projection = %s\n", projection ? projection : "");

		assert( syscall_sock->end_of_message() );;

		// if there is a projection, convert it into a set of attribute names
		if (projection) {
			StringTokenIterator list(projection);
			const std::string * attr;
			while ((attr = list.next_string())) { proj.insert(*attr); }
		}

		syscall_sock->encode();

			// Stream matching ads until the scan is exhausted; the final
			// iteration sends rval < 0 plus terrno as the terminator.
		do {
			errno = 0;
			ad = GetNextJobByConstraint( constraint, initScan );
			initScan=0; // 1 first time through, otherwise 0

			terrno = errno;
			rval = ad ? 0 : -1;
			dprintf( D_SYSCALLS, "\trval = %d, errno = %d\n", rval, terrno );

			assert( syscall_sock->code(rval) );

			if( rval < 0 ) {
				assert( syscall_sock->code(terrno) );
			}

			if( rval >= 0 ) {
				assert( putClassAd(syscall_sock, *ad, PUT_CLASSAD_NO_PRIVATE, proj.empty() ? NULL : &proj) );
				FreeJobAd(ad);
			}
		} while (rval >= 0);

		assert( syscall_sock->end_of_message() );;

		free( (char *)constraint );
		free( (char *)projection );
		return 0;
	}

	case CONDOR_CloseSocket:
	{
		assert( syscall_sock->end_of_message() );;
			// Client requested an orderly shutdown of this connection.
		return -1;
	}

	} /* End of switch */

	return -1;
} /* End of function */
int main(int argc, char **argv) { int result = 0; if ( argc <= 1 || (argc >= 2 && !strcmp("-usage", argv[1])) ) { printf("Usage: condor_check_userlogs <log file 1> " "[log file 2] ... [log file n]\n"); exit(0); } // Set up dprintf. dprintf_set_tool_debug("condor_check_userlogs", 0); set_debug_flags(NULL, D_ALWAYS); StringList logFiles; for ( int argnum = 1; argnum < argc; ++argnum ) { logFiles.append(argv[argnum]); } logFiles.rewind(); ReadMultipleUserLogs ru; char *filename; while ( (filename = logFiles.next()) ) { MyString filestring( filename ); CondorError errstack; if ( !ru.monitorLogFile( filestring, false, errstack ) ) { fprintf( stderr, "Error monitoring log file %s: %s\n", filename, errstack.getFullText().c_str() ); result = 1; } } bool logsMissing = false; CheckEvents ce; int totalSubmitted = 0; int netSubmitted = 0; bool done = false; while( !done ) { ULogEvent* e = NULL; MyString errorMsg; ULogEventOutcome outcome = ru.readEvent( e ); switch (outcome) { case ULOG_RD_ERROR: case ULOG_UNK_ERROR: logsMissing = true; case ULOG_NO_EVENT: printf( "Log outcome: %s\n", ULogEventOutcomeNames[outcome] ); done = true; break; case ULOG_OK: printf( "Log event: %s (%d.%d.%d)", ULogEventNumberNames[e->eventNumber], e->cluster, e->proc, e->subproc ); if ( ce.CheckAnEvent(e, errorMsg) != CheckEvents::EVENT_OKAY ) { fprintf(stderr, "%s\n", errorMsg.Value()); result = 1; } if( e->eventNumber == ULOG_SUBMIT ) { SubmitEvent* ee = (SubmitEvent*) e; printf( " (\"%s\")", ee->submitEventLogNotes ); ++totalSubmitted; ++netSubmitted; printf( "\n Total submitted: %d; net submitted: %d\n", totalSubmitted, netSubmitted ); } if( e->eventNumber == ULOG_JOB_HELD ) { JobHeldEvent* ee = (JobHeldEvent*) e; printf( " (code=%d subcode=%d)", ee->getReasonCode(), ee->getReasonSubCode()); } if( e->eventNumber == ULOG_JOB_TERMINATED ) { --netSubmitted; printf( "\n Total submitted: %d; net submitted: %d\n", totalSubmitted, netSubmitted ); } if( e->eventNumber == ULOG_JOB_ABORTED ) { 
--netSubmitted; printf( "\n Total submitted: %d; net submitted: %d\n", totalSubmitted, netSubmitted ); } if( e->eventNumber == ULOG_EXECUTABLE_ERROR ) { --netSubmitted; printf( "\n Total submitted: %d; net submitted: %d\n", totalSubmitted, netSubmitted ); } printf( "\n" ); break; default: fprintf(stderr, "Unexpected read event outcome!\n"); result = 1; break; } } logFiles.rewind(); while ( (filename = logFiles.next()) ) { MyString filestring( filename ); CondorError errstack; if ( !ru.unmonitorLogFile( filestring, errstack ) ) { fprintf( stderr, "Error unmonitoring log file %s: %s\n", filename, errstack.getFullText().c_str() ); result = 1; } } MyString errorMsg; CheckEvents::check_event_result_t checkAllResult = ce.CheckAllJobs(errorMsg); if ( checkAllResult != CheckEvents::EVENT_OKAY ) { fprintf(stderr, "%s\n", errorMsg.Value()); fprintf(stderr, "CheckAllJobs() result: %s\n", CheckEvents::ResultToString(checkAllResult)); result = 1; } if ( result == 0 ) { if ( !logsMissing ) { printf("Log(s) are okay\n"); } else { printf("Log(s) may be okay\n"); printf( "Some logs cannot be read\n"); } } else { printf("Log(s) have error(s)\n"); } return result; }
/////////////////////////////////////////////////////////////////////////////// // Note: this method should get speeded up (see Gnats PR 846). MyString MultiLogFiles::loadLogFileNameFromSubFile(const MyString &strSubFilename, const MyString &directory, bool &isXml, bool usingDefaultNode) { dprintf( D_FULLDEBUG, "MultiLogFiles::loadLogFileNameFromSubFile(%s, %s)\n", strSubFilename.Value(), directory.Value() ); TmpDir td; if ( directory != "" ) { MyString errMsg; if ( !td.Cd2TmpDir(directory.Value(), errMsg) ) { dprintf(D_ALWAYS, "Error from Cd2TmpDir: %s\n", errMsg.Value()); return ""; } } StringList logicalLines; if ( fileNameToLogicalLines( strSubFilename, logicalLines ) != "" ) { return ""; } MyString logFileName(""); MyString initialDir(""); MyString isXmlLogStr(""); // Now look through the submit file logical lines to find the // log file and initial directory (if specified) and combine // them into a path to the log file that's either absolute or // relative to the DAG submit directory. Also look for log_xml. const char *logicalLine; while( (logicalLine = logicalLines.next()) != NULL ) { MyString submitLine(logicalLine); MyString tmpLogName = getParamFromSubmitLine(submitLine, "log"); if ( tmpLogName != "" ) { logFileName = tmpLogName; } // If we are using the default node log, we don't care // about these if( !usingDefaultNode ) { MyString tmpInitialDir = getParamFromSubmitLine(submitLine, "initialdir"); if ( tmpInitialDir != "" ) { initialDir = tmpInitialDir; } MyString tmpLogXml = getParamFromSubmitLine(submitLine, "log_xml"); if ( tmpLogXml != "" ) { isXmlLogStr = tmpLogXml; } } } if ( !usingDefaultNode ) { // // Check for macros in the log file name -- we currently don't // handle those. 
// // If we are using the default node, we don't need to check this if ( logFileName != "" ) { if ( strstr(logFileName.Value(), "$(") ) { dprintf(D_ALWAYS, "MultiLogFiles: macros ('$(...') not allowed " "in log file name (%s) in DAG node submit files\n", logFileName.Value()); logFileName = ""; } } // Do not need to prepend initialdir if we are using the // default node log if ( logFileName != "" ) { // Prepend initialdir to log file name if log file name is not // an absolute path. if ( initialDir != "" && !fullpath(logFileName.Value()) ) { logFileName = initialDir + DIR_DELIM_STRING + logFileName; } // We do this in case the same log file is specified with a // relative and an absolute path. // Note: we now do further checking that doesn't rely on // comparing paths to the log files. wenger 2004-05-27. CondorError errstack; if ( !makePathAbsolute( logFileName, errstack ) ) { dprintf(D_ALWAYS, "%s\n", errstack.getFullText().c_str()); return ""; } } isXmlLogStr.lower_case(); isXml = (isXmlLogStr == "true"); if ( directory != "" ) { MyString errMsg; if ( !td.Cd2MainDir(errMsg) ) { dprintf(D_ALWAYS, "Error from Cd2MainDir: %s\n", errMsg.Value()); return ""; } } } return logFileName; }
bool DCSchedd::recycleShadow( int previous_job_exit_reason, ClassAd **new_job_ad, MyString &error_msg ) { int timeout = 300; CondorError errstack; ReliSock sock; if( !connectSock(&sock,timeout,&errstack) ) { error_msg.formatstr("Failed to connect to schedd: %s", errstack.getFullText().c_str()); return false; } if( !startCommand(RECYCLE_SHADOW, &sock, timeout, &errstack) ) { error_msg.formatstr("Failed to send RECYCLE_SHADOW to schedd: %s", errstack.getFullText().c_str()); return false; } if( !forceAuthentication(&sock, &errstack) ) { error_msg.formatstr("Failed to authenticate: %s", errstack.getFullText().c_str()); return false; } sock.encode(); int mypid = getpid(); if( !sock.put( mypid ) || !sock.put( previous_job_exit_reason ) || !sock.end_of_message() ) { error_msg = "Failed to send job exit reason"; return false; } sock.decode(); int found_new_job = 0; sock.get( found_new_job ); if( found_new_job ) { *new_job_ad = new ClassAd(); if( !getClassAd( &sock, *(*new_job_ad) ) ) { error_msg = "Failed to receive new job ClassAd"; delete *new_job_ad; *new_job_ad = NULL; return false; } } if( !sock.end_of_message() ) { error_msg = "Failed to receive end of message"; delete *new_job_ad; *new_job_ad = NULL; return false; } if( *new_job_ad ) { sock.encode(); int ok=1; if( !sock.put(ok) || !sock.end_of_message() ) { error_msg = "Failed to send ok"; delete *new_job_ad; *new_job_ad = NULL; return false; } } return true; }
// Entry point for the job-action tools (condor_rm, condor_hold,
// condor_release, condor_suspend, condor_continue, condor_vacate_job).
// The action mode is deduced from the name this binary was invoked as;
// the rest of main() parses the command line, locates the schedd, and
// applies the action to the requested jobs/constraints.
int
main( int argc, char *argv[] )
{
	char *arg;
	// Holds the non-option arguments (job ids / usernames) plus any
	// "-constraint <expr>" pairs, for a second processing pass below.
	char **args = (char **)malloc(sizeof(char *)*(argc - 1)); // args
	int nArgs = 0;				// number of args
	int i;
	char* cmd_str;
	DCCollector* pool = NULL;
	char* scheddName = NULL;
	char* scheddAddr = NULL;

		// Initialize our global variables
	has_constraint = false;

	myDistro->Init( argc, argv );
		// Strip any leading directory path from argv[0].
	MyName = strrchr( argv[0], DIR_DELIM_CHAR );
	if( !MyName ) {
		MyName = argv[0];
	} else {
		MyName++;
	}

	cmd_str = strchr( MyName, '_');

	// we match modes based on characters after the '_'. This means
	// 'condor_hold.exe' or 'condor_hold_wrapped' are all legal argv[0]'s
	// for condor_hold.

	if (cmd_str && strncasecmp( cmd_str, "_hold", strlen("_hold") ) == MATCH) {

		mode = JA_HOLD_JOBS;

	} else if ( cmd_str &&
			strncasecmp( cmd_str, "_release", strlen("_release") ) == MATCH ) {

		mode = JA_RELEASE_JOBS;

	} else if ( cmd_str &&
			strncasecmp( cmd_str, "_suspend", strlen("_suspend") ) == MATCH ) {

		mode = JA_SUSPEND_JOBS;

	} else if ( cmd_str &&
			strncasecmp( cmd_str, "_continue", strlen("_continue") ) == MATCH ) {

		mode = JA_CONTINUE_JOBS;

	}else if ( cmd_str &&
			strncasecmp( cmd_str, "_rm", strlen("_rm") ) == MATCH ) {

		mode = JA_REMOVE_JOBS;

	} else if( cmd_str && ! strncasecmp(cmd_str, "_vacate_job",
									strlen("_vacate_job")) ) {

		mode = JA_VACATE_JOBS;

	} else {
			// don't know what mode we're using, so bail.
		fprintf( stderr, "Unrecognized command name, \"%s\"\n", MyName );
		usage();
	}

	config();

	if( argc < 2 ) {
			// We got no indication of what to act on
		fprintf( stderr, "You did not specify any jobs\n" );
		usage();
	}

#if !defined(WIN32)
	install_sig_handler(SIGPIPE, SIG_IGN );
#endif

		// First pass over argv: options are handled immediately;
		// job ids and -constraint pairs are buffered into args[].
	for( argv++; (arg = *argv); argv++ ) {
		if( arg[0] == '-' ) {
			if (match_prefix(arg, "-debug")) {
					// dprintf to console
				dprintf_set_tool_debug("TOOL", 0);
			} else if (match_prefix(arg, "-constraint")) {
					// Buffer the flag AND its expression; they are
					// consumed together in the second pass below.
				args[nArgs] = arg;
				nArgs++;
				argv++;
				if( ! *argv ) {
					fprintf( stderr,
							 "%s: -constraint requires another argument\n",
							 MyName);
					exit(1);
				}
				args[nArgs] = *argv;
				nArgs++;
				ConstraintArg = true;
			} else if (match_prefix(arg, "-all")) {
				All = true;
			} else if (match_prefix(arg, "-addr")) {
				argv++;
				if( ! *argv ) {
					fprintf( stderr,
							 "%s: -addr requires another argument\n",
							 MyName);
					exit(1);
				}
					// Accept only a well-formed sinful string
					// (e.g. <1.2.3.4:5678>).
				if( is_valid_sinful(*argv) ) {
					scheddAddr = strdup(*argv);
					if( ! scheddAddr ) {
						fprintf( stderr, "Out of memory!\n" );
						exit(1);
					}
				} else {
					fprintf( stderr, "%s: \"%s\" is not a valid address\n",
							 MyName, *argv );
					fprintf( stderr, "Should be of the form "
							 "<ip.address.here:port>\n" );
					fprintf( stderr,
							 "For example: <123.456.789.123:6789>\n" );
					exit( 1 );
				}
			} else if (match_prefix(arg, "-reason")) {
				argv++;
				if( ! *argv ) {
					fprintf( stderr,
							 "%s: -reason requires another argument\n",
							 MyName);
					exit(1);
				}
				actionReason = strdup(*argv);
				if( ! actionReason ) {
					fprintf( stderr, "Out of memory!\n" );
					exit(1);
				}
			} else if (match_prefix(arg, "-subcode")) {
				argv++;
				if( ! *argv ) {
					fprintf( stderr,
							 "%s: -subcode requires another argument\n",
							 MyName);
					exit(1);
				}
					// Validate that the subcode is a complete decimal
					// integer; the string form is what gets stored.
				char *end = NULL;
				long code = strtol(*argv,&end,10);
				if( code == LONG_MIN || !end || *end || end==*argv ) {
					fprintf( stderr, "Invalid -subcode %s!\n", *argv );
					exit(1);
				}
				holdReasonSubCode = strdup(*argv);
				ASSERT( holdReasonSubCode );
			} else if (match_prefix(arg, "-forcex")) {
					// -forcex upgrades a plain remove to a forced local
					// removal; only meaningful for condor_rm.
				if( mode == JA_REMOVE_JOBS ) {
					mode = JA_REMOVE_X_JOBS;
				} else {
					fprintf( stderr,
							 "-forcex is only valid with condor_rm\n" );
					usage();
				}
			} else if (match_prefix(arg, "-fast")) {
					// -fast upgrades a vacate to a fast vacate; only
					// meaningful for condor_vacate_job.
				if( mode == JA_VACATE_JOBS ) {
					mode = JA_VACATE_FAST_JOBS;
				} else {
					fprintf( stderr,
							 "-fast is only valid with condor_vacate_job\n" );
					usage();
				}
			} else if (match_prefix(arg, "-name")) {
					// use the given name as the schedd name to connect to
				argv++;
				if( ! *argv ) {
					fprintf( stderr, "%s: -name requires another argument\n",
							 MyName);
					exit(1);
				}
				if( !(scheddName = get_daemon_name(*argv)) ) {
					fprintf( stderr, "%s: unknown host %s\n",
							 MyName, get_host_part(*argv) );
					exit(1);
				}
			} else if (match_prefix(arg, "-pool")) {
					// use the given name as the central manager to query
				argv++;
				if( ! *argv ) {
					fprintf( stderr, "%s: -pool requires another argument\n",
							 MyName);
					exit(1);
				}
				if( pool ) {
					delete pool;
				}
				pool = new DCCollector( *argv );
				if( ! pool->addr() ) {
					fprintf( stderr, "%s: %s\n", MyName, pool->error() );
					exit(1);
				}
			} else if (match_prefix(arg, "-version")) {
				version();
			} else if (match_prefix(arg, "-help")) {
				usage(0);
			} else {
				fprintf( stderr, "Unrecognized option: %s\n", arg );
				usage();
			}
		} else {
			if( All ) {
					// If -all is set, there should be no other
					// constraint arguments.
				usage();
			}
			args[nArgs] = arg;
			nArgs++;
			UserJobIdArg = true;
		}
	}

	if( ! (All || nArgs) ) {
			// We got no indication of what to act on
		fprintf( stderr, "You did not specify any jobs\n" );
		usage();
	}

		// -constraint and explicit job ids/usernames are mutually
		// exclusive ways of selecting jobs.
	if ( ConstraintArg && UserJobIdArg ) {
		fprintf( stderr,
				 "You can't use both -constraint and usernames or job ids\n" );
		usage();
	}

		// Pick the default reason if the user didn't specify one
	if( actionReason == NULL ) {
		switch( mode ) {
		case JA_RELEASE_JOBS:
			actionReason = strdup("via condor_release");
			break;
		case JA_REMOVE_X_JOBS:
			actionReason = strdup("via condor_rm -forcex");
			break;
		case JA_REMOVE_JOBS:
			actionReason = strdup("via condor_rm");
			break;
		case JA_HOLD_JOBS:
			actionReason = strdup("via condor_hold");
			break;
		case JA_SUSPEND_JOBS:
			actionReason = strdup("via condor_suspend");
			break;
		case JA_CONTINUE_JOBS:
			actionReason = strdup("via condor_continue");
			break;
		default:
			actionReason = NULL;
		}
	}

		// We're done parsing args, now make sure we know how to
		// contact the schedd.
	if( ! scheddAddr ) {
			// This will always do the right thing, even if either or
			// both of scheddName or pool are NULL.
		schedd = new DCSchedd( scheddName, pool ? pool->addr() : NULL );
	} else {
		schedd = new DCSchedd( scheddAddr );
	}
	if( ! schedd->locate() ) {
		fprintf( stderr, "%s: %s\n", MyName, schedd->error() );
		exit( 1 );
	}

		// Special case for condor_rm -forcex: a configuration
		// setting can disable this functionality. The real
		// validation is done in the schedd, but we can catch
		// the most common cases here and give a useful error
		// message.
	if(mode == JA_REMOVE_X_JOBS) {
		if( mayUserForceRm() == false) {
			fprintf( stderr, "Remove aborted. condor_rm -forcex has been disabled by the administrator.\n" );
			exit( 1 );
		}
	}

		// Process the args so we do the work.
	if( All ) {
		handleAll();
	} else {
			// Second pass over the buffered args: -constraint pairs
			// register constraints; everything else is a job id/user.
		for(i = 0; i < nArgs; i++) {
			if( match_prefix( args[i], "-constraint" ) ) {
				i++;
				addConstraint( args[i] );
			} else {
				procArg(args[i]);
			}
		}
	}

		// Deal with all the -constraint constraints
	handleConstraints();

		// Finally, do the actual work for all our args which weren't
		// constraints...
	if( job_ids ) {
		CondorError errstack;
		ClassAd* result_ad = doWorkByList( job_ids, &errstack );
		if (had_error) {
			fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() );
		}
		printNewMessages( result_ad, job_ids );
		delete( result_ad );
	}

		// If releasing jobs, and no errors happened, do a
		// reschedule command now.
	if ( mode == JA_RELEASE_JOBS && had_error == false ) {
		Daemon my_schedd(DT_SCHEDD, NULL, NULL);
		CondorError errstack;
		if (!my_schedd.sendCommand(RESCHEDULE, Stream::safe_sock, 0,
								   &errstack)) {
			fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() );
		}
	}

		// Exit status is nonzero iff some per-job operation failed.
	return had_error;
}
int main (int argc, char *argv[]) { #if !defined(WIN32) install_sig_handler(SIGPIPE, (SIG_HANDLER)SIG_IGN ); #endif // initialize to read from config file myDistro->Init( argc, argv ); myName = argv[0]; config(); dprintf_config_tool_on_error(0); // The arguments take two passes to process --- the first pass // figures out the mode, after which we can instantiate the required // query object. We add implied constraints from the command line in // the second pass. firstPass (argc, argv); // if the mode has not been set, it is STARTD_NORMAL if (mode == MODE_NOTSET) { setMode (MODE_STARTD_NORMAL, 0, DEFAULT); } // instantiate query object if (!(query = new CondorQuery (type))) { dprintf_WriteOnErrorBuffer(stderr, true); fprintf (stderr, "Error: Out of memory\n"); exit (1); } // if a first-pass setMode set a mode_constraint, apply it now to the query object if (mode_constraint && ! explicit_format) { query->addANDConstraint(mode_constraint); } // set pretty print style implied by the type of entity being queried // but do it with default priority, so that explicitly requested options // can override it switch (type) { #ifdef HAVE_EXT_POSTGRESQL case QUILL_AD: setPPstyle(PP_QUILL_NORMAL, 0, DEFAULT); break; #endif /* HAVE_EXT_POSTGRESQL */ case DEFRAG_AD: setPPstyle(PP_GENERIC_NORMAL, 0, DEFAULT); break; case STARTD_AD: setPPstyle(PP_STARTD_NORMAL, 0, DEFAULT); break; case SCHEDD_AD: setPPstyle(PP_SCHEDD_NORMAL, 0, DEFAULT); break; case MASTER_AD: setPPstyle(PP_MASTER_NORMAL, 0, DEFAULT); break; case CKPT_SRVR_AD: setPPstyle(PP_CKPT_SRVR_NORMAL, 0, DEFAULT); break; case COLLECTOR_AD: setPPstyle(PP_COLLECTOR_NORMAL, 0, DEFAULT); break; case STORAGE_AD: setPPstyle(PP_STORAGE_NORMAL, 0, DEFAULT); break; case NEGOTIATOR_AD: setPPstyle(PP_NEGOTIATOR_NORMAL, 0, DEFAULT); break; case GRID_AD: setPPstyle(PP_GRID_NORMAL, 0, DEFAULT); break; case GENERIC_AD: setPPstyle(PP_GENERIC, 0, DEFAULT); break; case ANY_AD: setPPstyle(PP_ANY_NORMAL, 0, DEFAULT); break; default: 
setPPstyle(PP_VERBOSE, 0, DEFAULT); } // set the constraints implied by the mode switch (mode) { #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: #endif /* HAVE_EXT_POSTGRESQL */ case MODE_DEFRAG_NORMAL: case MODE_STARTD_NORMAL: case MODE_MASTER_NORMAL: case MODE_CKPT_SRVR_NORMAL: case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: case MODE_COLLECTOR_NORMAL: case MODE_NEGOTIATOR_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: break; case MODE_OTHER: // tell the query object what the type we're querying is query->setGenericQueryType(genericType); free(genericType); genericType = NULL; break; case MODE_STARTD_AVAIL: // For now, -avail shows you machines avail to anyone. sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(unclaimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_RUN: sprintf (buffer, "%s == \"%s\"", ATTR_STATE, state_to_string(claimed_state)); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; case MODE_STARTD_COD: sprintf (buffer, "%s > 0", ATTR_NUM_COD_CLAIMS ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addORConstraint (buffer); break; default: break; } if(javaMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_JAVA ); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_HAS_JAVA); projList.AppendArg(ATTR_JAVA_MFLOPS); projList.AppendArg(ATTR_JAVA_VENDOR); projList.AppendArg(ATTR_JAVA_VERSION); } if(offlineMode) { query->addANDConstraint( "size( OfflineUniverses ) != 0" ); projList.AppendArg( "OfflineUniverses" ); // // Since we can't add a regex to a projection, explicitly list all // the attributes we know about. 
// projList.AppendArg( "HasVM" ); projList.AppendArg( "VMOfflineReason" ); projList.AppendArg( "VMOfflineTime" ); } if(absentMode) { sprintf( buffer, "%s == TRUE", ATTR_ABSENT ); if (diagnose) { printf( "Adding constraint %s\n", buffer ); } query->addANDConstraint( buffer ); projList.AppendArg( ATTR_ABSENT ); projList.AppendArg( ATTR_LAST_HEARD_FROM ); projList.AppendArg( ATTR_CLASSAD_LIFETIME ); } if(vmMode) { sprintf( buffer, "%s == TRUE", ATTR_HAS_VM); if (diagnose) { printf ("Adding constraint [%s]\n", buffer); } query->addANDConstraint (buffer); projList.AppendArg(ATTR_VM_TYPE); projList.AppendArg(ATTR_VM_MEMORY); projList.AppendArg(ATTR_VM_NETWORKING); projList.AppendArg(ATTR_VM_NETWORKING_TYPES); projList.AppendArg(ATTR_VM_HARDWARE_VT); projList.AppendArg(ATTR_VM_AVAIL_NUM); projList.AppendArg(ATTR_VM_ALL_GUEST_MACS); projList.AppendArg(ATTR_VM_ALL_GUEST_IPS); projList.AppendArg(ATTR_VM_GUEST_MAC); projList.AppendArg(ATTR_VM_GUEST_IP); } // second pass: add regular parameters and constraints if (diagnose) { printf ("----------\n"); } secondPass (argc, argv); // initialize the totals object if (ppStyle == PP_CUSTOM && using_print_format) { if (pmHeadFoot & HF_NOSUMMARY) ppTotalStyle = PP_CUSTOM; } else { ppTotalStyle = ppStyle; } TrackTotals totals(ppTotalStyle); // fetch the query QueryResult q; if ((mode == MODE_STARTD_NORMAL) && (ppStyle == PP_STARTD_NORMAL)) { projList.AppendArg("Name"); projList.AppendArg("Machine"); projList.AppendArg("Opsys"); projList.AppendArg("Arch"); projList.AppendArg("State"); projList.AppendArg("Activity"); projList.AppendArg("LoadAvg"); projList.AppendArg("Memory"); projList.AppendArg("ActvtyTime"); projList.AppendArg("MyCurrentTime"); projList.AppendArg("EnteredCurrentActivity"); } else if( ppStyle == PP_VERBOSE ) { // Remove everything from the projection list if we're displaying // the "long form" of the ads. projList.Clear(); // but if -attributes was supplied, show only those attributes if ( ! 
dashAttributes.isEmpty()) { const char * s; dashAttributes.rewind(); while ((s = dashAttributes.next())) { projList.AppendArg(s); } } } if( projList.Count() > 0 ) { char **attr_list = projList.GetStringArray(); query->setDesiredAttrs(attr_list); deleteStringArray(attr_list); } // if diagnose was requested, just print the query ad if (diagnose) { ClassAd queryAd; // print diagnostic information about inferred internal state setMode ((Mode) 0, 0, NULL); setType (NULL, 0, NULL); setPPstyle ((ppOption) 0, 0, DEFAULT); printf ("----------\n"); q = query->getQueryAd (queryAd); fPrintAd (stdout, queryAd); printf ("----------\n"); fprintf (stderr, "Result of making query ad was: %d\n", q); exit (1); } // Address (host:port) is taken from requested pool, if given. char* addr = (NULL != pool) ? pool->addr() : NULL; Daemon* requested_daemon = pool; // If we're in "direct" mode, then we attempt to locate the daemon // associated with the requested subsystem (here encoded by value of mode) // In this case the host:port of pool (if given) denotes which // pool is being consulted if( direct ) { Daemon *d = NULL; switch( mode ) { case MODE_MASTER_NORMAL: d = new Daemon( DT_MASTER, direct, addr ); break; case MODE_STARTD_NORMAL: case MODE_STARTD_AVAIL: case MODE_STARTD_RUN: case MODE_STARTD_COD: d = new Daemon( DT_STARTD, direct, addr ); break; #ifdef HAVE_EXT_POSTGRESQL case MODE_QUILL_NORMAL: d = new Daemon( DT_QUILL, direct, addr ); break; #endif /* HAVE_EXT_POSTGRESQL */ case MODE_SCHEDD_NORMAL: case MODE_SCHEDD_SUBMITTORS: d = new Daemon( DT_SCHEDD, direct, addr ); break; case MODE_NEGOTIATOR_NORMAL: d = new Daemon( DT_NEGOTIATOR, direct, addr ); break; case MODE_CKPT_SRVR_NORMAL: case MODE_COLLECTOR_NORMAL: case MODE_LICENSE_NORMAL: case MODE_STORAGE_NORMAL: case MODE_GENERIC_NORMAL: case MODE_ANY_NORMAL: case MODE_OTHER: case MODE_GRID_NORMAL: case MODE_HAD_NORMAL: // These have to go to the collector, anyway. 
break; default: fprintf( stderr, "Error: Illegal mode %d\n", mode ); exit( 1 ); break; } // Here is where we actually override 'addr', if we can obtain // address of the requested daemon/subsys. If it can't be // located, then fail with error msg. // 'd' will be null (unset) if mode is one of above that must go to // collector (MODE_ANY_NORMAL, MODE_COLLECTOR_NORMAL, etc) if (NULL != d) { if( d->locate() ) { addr = d->addr(); requested_daemon = d; } else { const char* id = d->idStr(); if (NULL == id) id = d->name(); dprintf_WriteOnErrorBuffer(stderr, true); if (NULL == id) id = "daemon"; fprintf(stderr, "Error: Failed to locate %s\n", id); fprintf(stderr, "%s\n", d->error()); exit( 1 ); } } } ClassAdList result; CondorError errstack; if (NULL != ads_file) { MyString req; // query requirements q = query->getRequirements(req); const char * constraint = req.empty() ? NULL : req.c_str(); if (read_classad_file(ads_file, result, constraint)) { q = Q_OK; } } else if (NULL != addr) { // this case executes if pool was provided, or if in "direct" mode with // subsystem that corresponds to a daemon (above). // Here 'addr' represents either the host:port of requested pool, or // alternatively the host:port of daemon associated with requested subsystem (direct mode) q = query->fetchAds (result, addr, &errstack); } else { // otherwise obtain list of collectors and submit query that way CollectorList * collectors = CollectorList::create(); q = collectors->query (*query, result, &errstack); delete collectors; } // if any error was encountered during the query, report it and exit if (Q_OK != q) { dprintf_WriteOnErrorBuffer(stderr, true); // we can always provide these messages: fprintf( stderr, "Error: %s\n", getStrQueryResult(q) ); fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if ((NULL != requested_daemon) && ((Q_NO_COLLECTOR_HOST == q) || (requested_daemon->type() == DT_COLLECTOR))) { // Specific long message if connection to collector failed. 
const char* fullhost = requested_daemon->fullHostname(); if (NULL == fullhost) fullhost = "<unknown_host>"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; char info[1000]; sprintf(info, "%s (%s)", fullhost, daddr); printNoCollectorContact( stderr, info, !expert ); } else if ((NULL != requested_daemon) && (Q_COMMUNICATION_ERROR == q)) { // more helpful message for failure to connect to some daemon/subsys const char* id = requested_daemon->idStr(); if (NULL == id) id = requested_daemon->name(); if (NULL == id) id = "daemon"; const char* daddr = requested_daemon->addr(); if (NULL == daddr) daddr = "<unknown>"; fprintf(stderr, "Error: Failed to contact %s at %s\n", id, daddr); } // fail exit (1); } if (noSort) { // do nothing } else if (sortSpecs.empty()) { // default classad sorting result.Sort((SortFunctionType)lessThanFunc); } else { // User requested custom sorting expressions: // insert attributes related to custom sorting result.Open(); while (ClassAd* ad = result.Next()) { for (vector<SortSpec>::iterator ss(sortSpecs.begin()); ss != sortSpecs.end(); ++ss) { ss->expr->SetParentScope(ad); classad::Value v; ss->expr->Evaluate(v); stringstream vs; // This will properly render all supported value types, // including undefined and error, although current semantic // pre-filters classads where sort expressions are undef/err: vs << ((v.IsStringValue())?"\"":"") << v << ((v.IsStringValue())?"\"":""); ad->AssignExpr(ss->keyAttr.c_str(), vs.str().c_str()); // Save the full expr in case user wants to examine on output: ad->AssignExpr(ss->keyExprAttr.c_str(), ss->arg.c_str()); } } result.Open(); result.Sort((SortFunctionType)customLessThanFunc); } // output result prettyPrint (result, &totals); delete query; return 0; }
int main(int argc, char *argv[]) { char *arg; int nArgs = 0; // number of args int i, result; char* pool = NULL; char* scheddName = NULL; char* scheddAddr = NULL; MyString method; char *tmp; myDistro->Init( argc, argv ); MyName = condor_basename(argv[0]); config(); #if !defined(WIN32) install_sig_handler(SIGPIPE, SIG_IGN ); #endif // dig around in the config file looking for what the config file says // about getting files from Condor. This defaults with the global variable // initialization. tmp = param( "SANDBOX_TRANSFER_METHOD" ); if ( tmp != NULL ) { method = tmp; free( tmp ); string_to_stm( method, st_method ); } char **args = (char **)malloc(sizeof(char *) * argc); // args if ( ! args) exit(2); // parse the arguments. for( argv++; (arg = *argv); argv++ ) { if( arg[0] == '-' ) { if( ! arg[1] ) { usage(); } switch( arg[1] ) { case 'd': // dprintf to console dprintf_set_tool_debug("TOOL", 0); break; case 'c': args[nArgs] = arg; nArgs++; argv++; if( ! *argv ) { fprintf( stderr, "%s: -constraint requires another argument\n", MyName); exit(1); } args[nArgs] = *argv; nArgs++; break; case 'a': if( arg[2] && arg[2] == 'd' ) { argv++; if( ! *argv ) { fprintf( stderr, "%s: -addr requires another argument\n", MyName); exit(1); } if( is_valid_sinful(*argv) ) { scheddAddr = strdup(*argv); if( ! scheddAddr ) { fprintf( stderr, "Out of Memory!\n" ); exit(1); } } else { fprintf( stderr, "%s: \"%s\" is not a valid address\n", MyName, *argv ); fprintf( stderr, "Should be of the form " "<ip.address.here:port>\n" ); fprintf( stderr, "For example: <123.456.789.123:6789>\n" ); exit( 1 ); } break; } All = true; break; case 'n': // use the given name as the schedd name to connect to argv++; if( ! *argv ) { fprintf( stderr, "%s: -name requires another argument\n", MyName); exit(1); } if ( scheddName ) free(scheddName); scheddName = strdup(*argv); break; case 'p': // use the given name as the central manager to query argv++; if( ! 
*argv ) { fprintf( stderr, "%s: -pool requires another argument\n", MyName); exit(1); } if( pool ) { free( pool ); } pool = strdup( *argv ); break; case 's': argv++; if( ! *argv ) { fprintf( stderr, "%s: -stm requires another argument\n", MyName); exit(1); } method = *argv; string_to_stm(method, st_method); break; case 'v': version(); break; case 'h': usage(0); break; default: fprintf( stderr, "Unrecognized option: %s\n", arg ); usage(); break; } } else { if( All ) { // If -all is set, there should be no other // constraint arguments. usage(); } args[nArgs] = arg; nArgs++; } } // Check to make sure we have a valid sandbox transfer mechanism. if (st_method == STM_UNKNOWN) { fprintf( stderr, "%s: Unknown sandbox transfer method: %s\n", MyName, method.Value()); usage(); exit(1); } if( ! (All || nArgs) ) { // We got no indication of what to act on fprintf( stderr, "You did not specify any jobs\n" ); usage(); } // We're done parsing args, now make sure we know how to // contact the schedd. if( ! scheddAddr ) { // This will always do the right thing, even if either or // both of scheddName or pool are NULL. schedd = new DCSchedd( scheddName, pool ); } else { schedd = new DCSchedd( scheddAddr ); } if( ! schedd->locate() ) { fprintf( stderr, "%s: %s\n", MyName, schedd->error() ); exit( 1 ); } // Process the args. if( All ) { handleAll(); } else { for(i = 0; i < nArgs; i++) { if( match_prefix( args[i], "-constraint" ) ) { i++; addConstraint( args[i] ); } else { procArg(args[i]); } } } // Sanity check: make certain we now have a constraint if ( global_constraint.Length() <= 0 ) { fprintf( stderr, "Unable to create a job constraint!\n"); exit(1); } fprintf(stdout,"Fetching data files...\n"); switch(st_method) { case STM_USE_SCHEDD_ONLY: { // start block // Get the sandbox directly from the schedd. // And now, do the work. 
CondorError errstack; result = schedd->receiveJobSandbox(global_constraint.Value(), &errstack); if ( !result ) { fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() ); fprintf( stderr, "ERROR: Failed to spool job files.\n" ); exit(1); } // All done return 0; } //end block break; case STM_USE_TRANSFERD: { // start block // NEW METHOD where we ask the schedd for a transferd, then get the // files from the transferd CondorError errstack; ClassAd respad; int invalid; MyString reason; MyString td_sinful; MyString td_cap; result = schedd->requestSandboxLocation(FTPD_DOWNLOAD, global_constraint, FTP_CFTP, &respad, &errstack); if ( !result ) { fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() ); fprintf( stderr, "ERROR: Failed to spool job files.\n" ); exit(1); } respad.LookupInteger(ATTR_TREQ_INVALID_REQUEST, invalid); if (invalid == TRUE) { fprintf( stderr, "ERROR: Failed to spool job files.\n" ); respad.LookupString(ATTR_TREQ_INVALID_REASON, reason); fprintf( stderr, "%s\n", reason.Value()); exit(EXIT_FAILURE); } respad.LookupString(ATTR_TREQ_TD_SINFUL, td_sinful); respad.LookupString(ATTR_TREQ_CAPABILITY, td_cap); dprintf(D_ALWAYS, "td: %s, cap: %s\n", td_sinful.Value(), td_cap.Value()); DCTransferD dctd(td_sinful.Value()); result = dctd.download_job_files(&respad, &errstack); if ( !result ) { fprintf( stderr, "\n%s\n", errstack.getFullText(true).c_str() ); fprintf( stderr, "ERROR: Failed to spool job files.\n" ); exit(1); } } // end block break; default: EXCEPT("PROGRAMMER ERROR: st_method must be known."); break; } // All done return 0; }
//---------------------------------------------------------------------------
// Start monitoring the user log file for this node.  Determines the log
// file to watch (from the Condor/Stork submit file, falling back to
// defaultNodeLog), rejects log files on NFS when nfsIsError is set, and
// registers the file with the appropriate multi-log reader.
// Returns true on success (or if already monitored); false on failure
// (after calling LogMonitorFailed()).  Note: an unrecoverable monitor
// error EXCEPTs before the final "return false" is reached.
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
			bool recovery, const char *defaultNodeLog, bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

		// Idempotent: a second call for an already-monitored node is a no-op.
	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already monitored\n", GetJobName() );
		return true;
	}

		// Condor and Stork jobs are tracked by separate log readers.
	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	std::string logFileStr;
	if ( _jobType == TYPE_CONDOR ) {
			// We check to see if the user has specified a log file
			// If not, we give him a default
		MyString templogFileStr = MultiLogFiles::loadLogFileNameFromSubFile(
					_cmdFile, _directory, _logFileIsXml, usingDefault);
		logFileStr = templogFileStr.Value();
	} else {
			// Stork submit files must name exactly one log file.
		StringList logFiles;
		MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, logFiles );
		if ( tmpResult != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						tmpResult.Value() );
			LogMonitorFailed();
			return false;
		} else if ( logFiles.number() != 1 ) {
			debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
						"in submit file %s; we want 1\n",
						logFiles.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		} else {
			logFiles.rewind();
			logFileStr = logFiles.next();
		}
	}

		// Warn the user if the node's log file is in /tmp.
	if ( logFileStr.find( "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET, "Warning: "
					"Log file %s for node %s is in /tmp\n",
					logFileStr.c_str(), GetJobName() );
			// A /tmp log is a harder error when the default log was
			// requested (DAG_STRICT_2) than otherwise (DAG_STRICT_1).
		check_warning_strictness( usingDefault ? DAG_STRICT_2 :
					DAG_STRICT_1 );
	}

	if ( logFileStr == "" ) {
			// No log file in the submit file: fall back to the default
			// node log.
		logFileStr = defaultNodeLog;
		_useDefaultLog = true;
			// Default User log is never XML
			// This could be specified in the submit file and should be
			// ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL, "Unable to get log file from "
					"submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logFileStr.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if( append_default_log ) {
				// DAGman is not going to look at the user-specified log.
				// It will look at the defaultNode log.
			logFileStr = defaultNodeLog;
				// NOTE(review): _useDefaultLog is set false here even
				// though logFileStr was just switched to defaultNodeLog;
				// presumably append_default_log is the flag that matters
				// on this path -- confirm against users of _useDefaultLog.
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

		// This function returns true if the log file is on NFS and
		// that is an error.  If the log file is on NFS, but nfsIsError
		// is false, it prints a warning but returns false.
	if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
				nfsIsError ) ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logFileStr.c_str() );
		LogMonitorFailed();
		return false;
	}

	delete [] _logFile;
		// Saving log file here in case submit file gets changed.
	_logFile = strnewp( logFileStr.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );
	CondorError errstack;
		// In recovery mode we do NOT truncate/reset the log (second arg).
	if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
			// Fatal: EXCEPT aborts, so the return below is unreachable
			// in practice.
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}
void doContactSchedd() { if (command_queue.IsEmpty()) { daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); // Come back in a min return; } dprintf(D_FULLDEBUG,"in doContactSchedd\n"); SchedDRequest * current_command = NULL; int error=FALSE; std::string error_msg; CondorError errstack; bool do_reschedule = false; int failure_line_num = 0; int failure_errno = 0; // Try connecting to schedd DCSchedd dc_schedd ( ScheddAddr, ScheddPool ); if (dc_schedd.error() || !dc_schedd.locate()) { sprintf( error_msg, "Error locating schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); // If you can't connect return "Failure" on every job request command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command == SchedDRequest::SDC_STATUS_CONSTRAINED) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0"}; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_SUBMIT_JOB) { const char * result[] = { GAHP_RESULT_FAILURE, NULL, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); } else if (current_command->command == SchedDRequest::SDC_UPDATE_LEASE) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } current_command->status = SchedDRequest::SDCS_COMPLETED; } } SchedDRequest::schedd_command_type commands [] = { SchedDRequest::SDC_REMOVE_JOB, SchedDRequest::SDC_HOLD_JOB, SchedDRequest::SDC_RELEASE_JOB }; const char * command_titles [] = { "REMOVE_JOB", "HOLD_JOB", "RELEASE_JOB" }; // REMOVE // HOLD // RELEASE int i=0; while (i<3) { StringList id_list; SimpleList <SchedDRequest*> this_batch; SchedDRequest::schedd_command_type 
this_command = commands[i]; const char * this_action = command_titles[i]; const char * this_reason = NULL; dprintf (D_FULLDEBUG, "Processing %s requests\n", this_action); error = FALSE; // Create a batch of commands with the same command type AND the same reason command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != this_command) continue; if ((this_reason != NULL) && (strcmp (current_command->reason, this_reason) != 0)) continue; if (this_reason == NULL) this_reason = current_command->reason; char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", current_command->cluster_id, current_command->proc_id); id_list.append (job_id_buff); this_batch.Append (current_command); } // If we haven't found any.... if (id_list.isEmpty()) { i++; continue; // ... then try the next command } // Perform the appropriate command on the current batch ClassAd * result_ad= NULL; if (this_command == SchedDRequest::SDC_REMOVE_JOB) { errstack.clear(); result_ad= dc_schedd.removeJobs ( &id_list, this_reason, &errstack); } else if (this_command == SchedDRequest::SDC_HOLD_JOB) { errstack.clear(); result_ad= dc_schedd.holdJobs ( &id_list, this_reason, NULL, &errstack); } else if (this_command == SchedDRequest::SDC_RELEASE_JOB) { errstack.clear(); result_ad= dc_schedd.releaseJobs ( &id_list, this_reason, &errstack); } else { EXCEPT( "Unexpected command type %d in doContactSchedd", this_command ); } // Analyze the result ad if (!result_ad) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s %s: %s", ScheddAddr, dc_schedd.addr(), errstack.getFullText() ); } else { result_ad->dPrint (D_FULLDEBUG); if ( this_command == SchedDRequest::SDC_RELEASE_JOB ) { do_reschedule = true; } } // Go through the batch again, and create responses for each request this_batch.Rewind(); while (this_batch.Next(current_command)) { // Check the result char job_id_buff[30]; if (result_ad && (error == 
FALSE)) { sprintf (job_id_buff, "job_%d_%d", current_command->cluster_id, current_command->proc_id); int remove_result; if (result_ad->LookupInteger (job_id_buff, remove_result)) { switch (remove_result) { case AR_ERROR: error = TRUE; error_msg = "General Error"; break; case AR_SUCCESS: error = FALSE; break; case AR_NOT_FOUND: error = TRUE; error_msg = "Job not found"; break; case AR_BAD_STATUS: error = TRUE; error_msg = "Bad job status"; break; case AR_ALREADY_DONE: error = TRUE; error_msg = "Already done"; break; case AR_PERMISSION_DENIED: error = TRUE; error_msg = "Permission denied"; break; default: error = TRUE; error_msg = "Unknown Result"; } // hctiws } else { error_msg = "Unable to get result"; } // fi lookup result for job } // fi error == FALSE if (error) { dprintf (D_ALWAYS, "Error (operation: %s) %d.%d: %s\n", this_action, current_command->cluster_id, current_command->proc_id, error_msg.c_str()); const char * result[2]; result[0] = GAHP_RESULT_FAILURE; result[1] = error_msg.c_str(); enqueue_result (current_command->request_id, result, 2); } else { dprintf (D_ALWAYS, "Succeess (operation: %s) %d.%d\n", this_action, current_command->cluster_id, current_command->proc_id); const char * result[2]; result[0] = GAHP_RESULT_SUCCESS; result[1] = NULL; enqueue_result (current_command->request_id, result, 2); } // fi error // Mark the status current_command->status = SchedDRequest::SDCS_COMPLETED; } // elihw this_batch if ( result_ad ) { delete result_ad; } } dprintf (D_FULLDEBUG, "Processing JOB_STAGE_IN requests\n"); // JOB_STAGE_IN int MAX_BATCH_SIZE=1; // This should be a config param SimpleList <SchedDRequest*> stage_in_batch; do { stage_in_batch.Clear(); command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_IN) continue; dprintf (D_ALWAYS, "Adding %d.%d to STAGE_IN batch\n", current_command->cluster_id, 
current_command->proc_id); stage_in_batch.Append (current_command); if (stage_in_batch.Number() >= MAX_BATCH_SIZE) break; } if (stage_in_batch.Number() > 0) { ClassAd ** array = new ClassAd*[stage_in_batch.Number()]; i=0; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { array[i++] = current_command->classad; } error = FALSE; errstack.clear(); if (!dc_schedd.spoolJobFiles( stage_in_batch.Number(), array, &errstack )) { error = TRUE; sprintf( error_msg, "Error sending files to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } delete [] array; stage_in_batch.Rewind(); while (stage_in_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_IN requests } while (stage_in_batch.Number() > 0); dprintf (D_FULLDEBUG, "Processing JOB_STAGE_OUT requests\n"); // JOB_STAGE_OUT SimpleList <SchedDRequest*> stage_out_batch; command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_STAGE_OUT) continue; stage_out_batch.Append (current_command); } if (stage_out_batch.Number() > 0) { std::string constraint = ""; stage_out_batch.Rewind(); int jobsexpected = stage_out_batch.Number(); while (stage_out_batch.Next(current_command)) { sprintf_cat( constraint, "(ClusterId==%d&&ProcId==%d)||", current_command->cluster_id, current_command->proc_id ); } constraint += "False"; error = FALSE; errstack.clear(); int jobssent; if (!dc_schedd.receiveJobSandbox( constraint.c_str(), &errstack, &jobssent )) { error = TRUE; sprintf( error_msg, "Error receiving files 
from schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if(error == FALSE && jobssent != jobsexpected) { error = TRUE; sprintf( error_msg, "Schedd %s didn't send expected files", ScheddAddr ); dprintf (D_ALWAYS, "Transfered files for %d jobs but got files for %d jobs. (Schedd %s with contraint %s\n", jobsexpected, jobssent, ScheddAddr, constraint.c_str()); } stage_out_batch.Rewind(); while (stage_out_batch.Next(current_command)) { current_command->status = SchedDRequest::SDCS_COMPLETED; if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); } else { const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); } } // elihw (command_queue) } // fi has STAGE_OUT requests dprintf (D_FULLDEBUG, "Processing JOB_REFRESH_PROXY requests\n"); CondorVersionInfo ver_info(dc_schedd.version()); bool delegate_credential; if ( ver_info.built_since_version(6,7,19) && param_boolean( "DELEGATE_JOB_GSI_CREDENTIALS", true ) ) { delegate_credential = true; } else { delegate_credential = false; } // JOB_REFRESH_PROXY command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_JOB_REFRESH_PROXY) continue; time_t expiration_time = GetDesiredDelegatedJobCredentialExpiration(current_command->classad); time_t result_expiration_time = 0; bool result; errstack.clear(); if ( delegate_credential ) { result = dc_schedd.delegateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, expiration_time, &result_expiration_time, &errstack ); // Currently, we do not propagate the actual resulting // expiration time back to the gridmanager. We // probably should. 
} else { result = dc_schedd.updateGSIcredential( current_command->cluster_id, current_command->proc_id, current_command->proxy_file, &errstack ); } current_command->status = SchedDRequest::SDCS_COMPLETED; if (result == false) { sprintf( error_msg, "Error refreshing proxy to schedd %s: %s", ScheddAddr, errstack.getFullText() ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); const char * result_to_queue[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result_to_queue, 2); } else { const char * result_to_queue[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result_to_queue, 2); } } // Now do all the QMGMT transactions error = FALSE; // Try connecting to the queue Qmgr_connection * qmgr_connection; if ((qmgr_connection = ConnectQ(dc_schedd.addr(), QMGMT_TIMEOUT, false, NULL, NULL, dc_schedd.version() )) == NULL) { error = TRUE; sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else { errno = 0; AbortTransaction(); // Just so we can call BeginTransaction() in the loop if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } dprintf (D_FULLDEBUG, "Processing UPDATE_CONSTRAINED/UDATE_JOB requests\n"); // UPDATE_CONSTRAINED // UDATE_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if ((current_command->command != SchedDRequest::SDC_UPDATE_CONSTRAINED) && (current_command->command != SchedDRequest::SDC_UPDATE_JOB)) continue; if (qmgr_connection == NULL) goto update_report_result; error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } current_command->classad->ResetExpr(); ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = 
ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else { if (current_command->command == SchedDRequest::SDC_UPDATE_CONSTRAINED) { if( SetAttributeByConstraint(current_command->constraint, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed (errno=%d) to SetAttributeByConstraint %s=%s for constraint %s", errno, lhstr, rhstr, current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } else if (current_command->command == SchedDRequest::SDC_UPDATE_JOB) { if( SetAttribute(current_command->cluster_id, current_command->proc_id, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute() %s=%s for job %d.%d", lhstr, rhstr, current_command->cluster_id, current_command->proc_id); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } } if (error) break; } // elihw classad update_report_result: if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw dprintf 
(D_FULLDEBUG, "Processing UPDATE_LEASE requests\n"); // UPDATE_LEASE command_queue.Rewind(); while (command_queue.Next(current_command)) { error = FALSE; if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_UPDATE_LEASE) continue; std::string success_job_ids=""; if (qmgr_connection == NULL) { sprintf( error_msg, "Error connecting to schedd %s", ScheddAddr ); error = TRUE; } else { error = FALSE; errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } for (i=0; i<current_command->num_jobs; i++) { time_t time_now = time(NULL); int duration = current_command->expirations[i].expiration - time_now; dprintf (D_FULLDEBUG, "Job %d.%d SetTimerAttribute=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, duration); if (SetTimerAttribute (current_command->expirations[i].cluster, current_command->expirations[i].proc, ATTR_TIMER_REMOVE_CHECK, duration) < 0) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } dprintf (D_ALWAYS, "Unable to SetTimerAttribute(%d, %d), errno=%d\n", current_command->expirations[i].cluster, current_command->expirations[i].proc, errno); } else { // Append job id to the result line if (success_job_ids.length() > 0) success_job_ids += ","; sprintf_cat( success_job_ids, "%d.%d", current_command->expirations[i].cluster, current_command->expirations[i].proc); } } //rof jobs for request } // fi error if (error) { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } } else { 
if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, NULL, success_job_ids.length()?success_job_ids.c_str():NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } // fi } // elihw UPDATE_LEASE requests dprintf (D_FULLDEBUG, "Processing SUBMIT_JOB requests\n"); // SUBMIT_JOB command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_SUBMIT_JOB) continue; int ClusterId = -1; int ProcId = -1; if (qmgr_connection == NULL) { error = TRUE; goto submit_report_result; } errno = 0; BeginTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } error = FALSE; if ((ClusterId = NewCluster()) >= 0) { ProcId = NewProc (ClusterId); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } if ( ClusterId < 0 ) { error = TRUE; error_msg = "Unable to create a new job cluster"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } else if ( ProcId < 0 ) { error = TRUE; error_msg = "Unable to create a new job proc"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } if ( ClusterId == -2 || ProcId == -2 ) { error = TRUE; error_msg = "Number of submitted jobs would exceed MAX_JOBS_SUBMITTED\n"; dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); } // Adjust the argument/environment syntax based on the version // of the schedd we are talking to. if( error == FALSE) { CondorVersionInfo version_info(dc_schedd.version()); ArgList arglist; MyString arg_error_msg; Env env_obj; MyString env_error_msg; if(!arglist.AppendArgsFromClassAd(current_command->classad,&arg_error_msg) || ! 
arglist.InsertArgsIntoClassAd(current_command->classad,&version_info,&arg_error_msg)) { sprintf( error_msg, "ERROR: ClassAd problem in converting arguments to syntax " "for schedd (version=%s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", arg_error_msg.Value()); dprintf( D_ALWAYS,"%s\n", error_msg.c_str() ); error = TRUE; } if(!env_obj.MergeFrom(current_command->classad,&env_error_msg) || !env_obj.InsertEnvIntoClassAd(current_command->classad,&env_error_msg,NULL,&version_info)) { sprintf( error_msg, "ERROR: Failed to convert environment to target syntax" " for schedd (version %s): %s\n", dc_schedd.version() ? dc_schedd.version() : "NULL", env_error_msg.Value()); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } } if( error == FALSE ) { // See the comment in the function body of ExpandInputFileList // for an explanation of what is going on here. MyString transfer_input_error_msg; if( !FileTransfer::ExpandInputFileList( current_command->classad, transfer_input_error_msg ) ) { dprintf( D_ALWAYS, "%s\n", transfer_input_error_msg.Value() ); error = TRUE; } } if ( error == FALSE ) { current_command->classad->Assign(ATTR_CLUSTER_ID, ClusterId); current_command->classad->Assign(ATTR_PROC_ID, ProcId); // Special case for the job lease int expire_time; if ( current_command->classad->LookupInteger( ATTR_TIMER_REMOVE_CHECK, expire_time ) ) { if ( SetTimerAttribute( ClusterId, ProcId, ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL) ) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetTimerAttribute %s=%ld for job %d.%d", ATTR_TIMER_REMOVE_CHECK, expire_time - time(NULL), ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; goto submit_report_result; } current_command->classad->Delete( ATTR_TIMER_REMOVE_CHECK ); } // Set all the classad attribute on the remote classad current_command->classad->ResetExpr(); 
ExprTree *tree; const char *lhstr, *rhstr; while( current_command->classad->NextExpr(lhstr, tree) ) { rhstr = ExprTreeToString( tree ); if( !lhstr || !rhstr) { sprintf( error_msg, "ERROR: ClassAd problem in Updating by constraint %s", current_command->constraint ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } else if( SetAttribute (ClusterId, ProcId, lhstr, rhstr) == -1 ) { if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } sprintf( error_msg, "ERROR: Failed to SetAttribute %s=%s for job %d.%d", lhstr, rhstr, ClusterId, ProcId ); dprintf( D_ALWAYS, "%s\n", error_msg.c_str() ); error = TRUE; } if (error) break; } // elihw classad } // fi error==FALSE submit_report_result: char job_id_buff[30]; sprintf (job_id_buff, "%d.%d", ClusterId, ProcId); if (error) { const char * result[] = { GAHP_RESULT_FAILURE, job_id_buff, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); if ( qmgr_connection != NULL ) { errno = 0; AbortTransaction(); if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } } current_command->status = SchedDRequest::SDCS_COMPLETED; } else { if ( RemoteCommitTransaction() < 0 ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } const char * result[] = { GAHP_RESULT_SUCCESS, job_id_buff, NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } // elihw dprintf (D_FULLDEBUG, "Processing STATUS_CONSTRAINED requests\n"); // STATUS_CONSTRAINED command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status != SchedDRequest::SDCS_NEW) continue; if (current_command->command != SchedDRequest::SDC_STATUS_CONSTRAINED) continue; if (qmgr_connection != NULL) { SimpleList <MyString *> matching_ads; error = FALSE; ClassAd *next_ad; ClassAdList adlist; // Only use 
GetAllJobsByConstraint if remote schedd is // 6.9.5 or newer. Previous versions either did not // support this call, or they closed the Qmgmt connection // as a side-effect of this call. if( ver_info.built_since_version(6,9,5) ) { dprintf( D_FULLDEBUG, "Calling GetAllJobsByConstraint(%s)\n", current_command->constraint ); // NOTE: this could be made more efficient if we knew // the list of attributes to query. For lack of that, // we just get all attributes. GetAllJobsByConstraint( current_command->constraint, "", adlist); } else { // This is the old latency-prone method. dprintf( D_FULLDEBUG, "Calling GetNextJobByConstraint(%s)\n", current_command->constraint ); next_ad = GetNextJobByConstraint( current_command->constraint, 1 ); while( next_ad != NULL ) { adlist.Insert( next_ad ); next_ad = GetNextJobByConstraint( current_command->constraint, 0 ); } } // NOTE: ClassAdList will deallocate the ClassAds in it adlist.Rewind(); while( (next_ad=adlist.Next()) ) { MyString * da_buffer = new MyString(); // Use a ptr to avoid excessive copying if ( useXMLClassads ) { ClassAdXMLUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } else { NewClassAdUnparser unparser; unparser.SetUseCompactSpacing(true); unparser.Unparse (next_ad, *da_buffer); } matching_ads.Append (da_buffer); } if ( errno == ETIMEDOUT ) { failure_line_num = __LINE__; failure_errno = errno; goto contact_schedd_disconnect; } // now output this list of classads into a result const char ** result = new const char* [matching_ads.Length() + 3]; std::string _ad_count; sprintf( _ad_count, "%d", matching_ads.Length() ); int count=0; result[count++] = GAHP_RESULT_SUCCESS; result[count++] = NULL; result[count++] = _ad_count.c_str(); MyString *next_string; matching_ads.Rewind(); while (matching_ads.Next(next_string)) { result[count++] = next_string->Value(); } enqueue_result (current_command->request_id, result, count); current_command->status = SchedDRequest::SDCS_COMPLETED; 
// Cleanup matching_ads.Rewind(); while (matching_ads.Next(next_string)) { delete next_string; } //CommitTransaction(); delete [] result; } else { const char * result[] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; //RemoteCommitTransaction(); enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } } //elihw contact_schedd_disconnect: if ( qmgr_connection != NULL ) { DisconnectQ (qmgr_connection, FALSE); } if ( failure_line_num ) { // We had an error talking to the schedd. Take all of our // incomplete commands and mark them as failed. // TODO Consider retrying these commands, rather than // immediately marking them as failed. if ( failure_errno == ETIMEDOUT ) { dprintf( D_ALWAYS, "Timed out talking to schedd at line %d in " "doContactSchedd()\n", failure_line_num ); sprintf( error_msg, "Timed out talking to schedd" ); } else { dprintf( D_ALWAYS, "Error talking to schedd at line %d in " "doContactSchedd(), errno=%d (%s)\n", failure_line_num, failure_errno, strerror(failure_errno) ); sprintf( error_msg, "Error talking to schedd" ); } command_queue.Rewind(); while (command_queue.Next(current_command)) { if ( current_command->status != SchedDRequest::SDCS_NEW ) { continue; } switch( current_command->command ) { case SchedDRequest::SDC_UPDATE_JOB: case SchedDRequest::SDC_UPDATE_CONSTRAINED: { const char *result[2] = { GAHP_RESULT_FAILURE, error_msg.c_str() }; enqueue_result (current_command->request_id, result, 2); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_UPDATE_LEASE: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), NULL }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_SUBMIT_JOB: { const char *result[3] = { GAHP_RESULT_FAILURE, "-1.-1", error_msg.c_str() }; enqueue_result (current_command->request_id, result, 3); current_command->status 
= SchedDRequest::SDCS_COMPLETED; } break; case SchedDRequest::SDC_STATUS_CONSTRAINED: { const char *result[3] = { GAHP_RESULT_FAILURE, error_msg.c_str(), "0" }; enqueue_result (current_command->request_id, result, 3); current_command->status = SchedDRequest::SDCS_COMPLETED; } break; default: // Do nothing ; } } } if ( do_reschedule ) { dc_schedd.reschedule(); } // Write all of our results to our parent. flush_results(); dprintf (D_FULLDEBUG, "Finishing doContactSchedd()\n"); // Clean up the list command_queue.Rewind(); while (command_queue.Next(current_command)) { if (current_command->status == SchedDRequest::SDCS_COMPLETED) { command_queue.DeleteCurrent(); delete current_command; } } // Come back soon.. // QUESTION: Should this always be a fixed time period? daemonCore->Reset_Timer( contactScheddTid, contact_schedd_interval ); }
void printClassAd( void ) { printf( "%s = \"%s\"\n", ATTR_VERSION, CondorVersion() ); printf( "%s = True\n", ATTR_IS_DAEMON_CORE ); printf( "%s = True\n", ATTR_HAS_FILE_TRANSFER ); printf( "%s = True\n", ATTR_HAS_PER_FILE_ENCRYPTION ); printf( "%s = True\n", ATTR_HAS_RECONNECT ); printf( "%s = True\n", ATTR_HAS_MPI ); printf( "%s = True\n", ATTR_HAS_TDP ); printf( "%s = True\n", ATTR_HAS_JOB_DEFERRAL ); /* Attributes describing what kinds of Job Info Communicators this starter has. This is mostly for COD, but someday might be useful to other people, too. There's no need to advertise the fact we've got a JICShadow, since all starters always have and will be able to communicate with a shadow... */ printf( "%s = True\n", ATTR_HAS_JIC_LOCAL_CONFIG ); printf( "%s = True\n", ATTR_HAS_JIC_LOCAL_STDIN ); ClassAd *ad = java_detect(); if(ad) { int gotone=0; float mflops; char *str = 0; if(ad->LookupString(ATTR_JAVA_VENDOR,&str)) { printf("%s = \"%s\"\n",ATTR_JAVA_VENDOR,str); free(str); str = 0; gotone++; } if(ad->LookupString(ATTR_JAVA_VERSION,&str)) { printf("%s = \"%s\"\n",ATTR_JAVA_VERSION,str); free(str); str = 0; gotone++; } if(ad->LookupString("JavaSpecificationVersion",&str)) { printf("JavaSpecificationVersion = \"%s\"\n",str); free(str); str = 0; gotone++; } if(ad->LookupFloat(ATTR_JAVA_MFLOPS,mflops)) { printf("%s = %f\n", ATTR_JAVA_MFLOPS,mflops); gotone++; } if(gotone>0) printf( "%s = True\n",ATTR_HAS_JAVA); delete ad; } // VM universe stuff if( VMProc::vm_univ_detect() ) { // This doesn't mean that vm universe is really available. // This just means that starter has codes for vm universe. // Actual testing for vm universe will be // done by vmuniverse manager in startd. 
// ATTR_HAS_VM may be overwritten by vmuniverse manager in startd printf( "%s = True\n",ATTR_HAS_VM); } // Advertise which file transfer plugins are supported FileTransfer ft; CondorError e; ft.InitializePlugins(e); if (e.code()) { dprintf(D_ALWAYS, "WARNING: Initializing plugins returned: %s\n", e.getFullText().c_str()); } MyString method_list = ft.GetSupportedMethods(); if (!method_list.IsEmpty()) { printf("%s = \"%s\"\n", ATTR_HAS_FILE_TRANSFER_PLUGIN_METHODS, method_list.Value()); } #if defined(WIN32) // Advertise our ability to run jobs as the submitting user printf("%s = True\n", ATTR_HAS_WIN_RUN_AS_OWNER); #endif }
bool Triggerd::PerformQueries() { ClassAdList result; CondorError errstack; QueryResult status; Trigger* trig = NULL; CondorQuery* query; bool ret_val = true; std::map<uint32_t,Trigger*>::iterator iter; ClassAd* ad = NULL; std::string eventText; char* token = NULL; std::string triggerText; char* queryString = NULL; ExprTree* attr = NULL; std::list<std::string> missing_nodes; size_t pos; size_t prev_pos; bool bad_trigger = false; const char* token_str = NULL; if (0 < triggers.size()) { dprintf(D_FULLDEBUG, "Triggerd: Evaluating %d triggers\n", (int)triggers.size()); query = new CondorQuery(ANY_AD); for (iter = triggers.begin(); iter != triggers.end(); iter++) { // Clear any pre-exhisting custom contraints and add the constraint // for this trigger trig = iter->second; query->clearORCustomConstraints(); query->clearANDCustomConstraints(); queryString = strdup(trig->GetQuery().c_str()); ReplaceAllChars(queryString, '\'', '"'); query->addANDConstraint(queryString); free(queryString); // Perform the query and check the result if (NULL != query_collector) { status = query->fetchAds(result, query_collector->addr(), &errstack); } else { status = collectors->query(*query, result, &errstack); } if (Q_OK != status) { // Problem with the query if (Q_COMMUNICATION_ERROR == status) { dprintf(D_ALWAYS, "Triggerd Error: Error contacting the collecter - %s\n", errstack.getFullText(true).c_str()); if (CEDAR_ERR_CONNECT_FAILED == errstack.code(0)) { dprintf(D_ALWAYS, "Triggerd Error: Couldn't contact the collector on the central manager\n"); } } else { dprintf(D_ALWAYS, "Triggerd Error: Could not retrieve ads - %s\n", getStrQueryResult(status)); } ret_val = false; break; } else { dprintf(D_FULLDEBUG, "Query successful. Parsing results\n"); // Query was successful, so parse the results result.Open(); while ((ad = result.Next())) { if (true == bad_trigger) { // Avoid processing a bad trigger multiple times. 
Remove // all result ads and reset the flag dprintf(D_FULLDEBUG, "Cleaning up after a bad trigger\n"); result.Delete(ad); while ((ad = result.Next())) { result.Delete(ad); } bad_trigger = false; break; } eventText = ""; triggerText = trig->GetText(); dprintf(D_FULLDEBUG, "Parsing trigger text '%s'\n", triggerText.c_str()); prev_pos = pos = 0; while (prev_pos < triggerText.length()) { pos = triggerText.find("$(", prev_pos, 2); if (std::string::npos == pos) { // Didn't find the start of a varible, so append the // remaining string dprintf(D_FULLDEBUG, "Adding text string to event text\n"); eventText += triggerText.substr(prev_pos, std::string::npos); prev_pos = triggerText.length(); } else { // Found a variable for substitution. Need to add // text before it to the string, grab the variable // to substitute for, and put its value in the text eventText += triggerText.substr(prev_pos, pos - prev_pos); dprintf(D_FULLDEBUG, "Adding text string prior to variable substitution to event text\n"); // Increment the position by 2 to skip the $( prev_pos = pos + 2; pos = triggerText.find(")", prev_pos, 1); if (std::string::npos == pos) { // Uh-oh. We have a start of a variable substitution // but no closing marker. dprintf(D_FULLDEBUG, "Error: Failed to find closing varable substitution marker ')'. Aborting processing of the trigger\n"); bad_trigger = true; break; } else { token_str = triggerText.substr(prev_pos, pos-prev_pos).c_str(); token = RemoveWS(token_str); dprintf(D_FULLDEBUG, "token: '%s'\n", token); if (NULL == token) { dprintf(D_ALWAYS, "Removing whitespace from %s produced unusable name. 
Aborting processing of the trigger\n", token_str); bad_trigger = true; break; } attr = ad->LookupExpr(token); if (NULL == attr) { // The token isn't found in the classad, so treat it // like a string dprintf(D_FULLDEBUG, "Adding text string to event text\n"); eventText += token; } else { dprintf(D_FULLDEBUG, "Adding classad value to event text\n"); eventText += ExprTreeToString(attr); } if (NULL != token) { free(token); token = NULL; } ++pos; } prev_pos = pos; } } // Remove the trailing space std::string::size_type notwhite = eventText.find_last_not_of(" "); eventText.erase(notwhite+1); // Send the event if (false == bad_trigger) { EventCondorTriggerNotify event(eventText, time(NULL)); singleton->getInstance()->raiseEvent(event); dprintf(D_FULLDEBUG, "Triggerd: Raised event with text '%s'\n", eventText.c_str()); } result.Delete(ad); } bad_trigger = false; result.Close(); } } delete query; } else { dprintf(D_FULLDEBUG, "Triggerd: No triggers to evaluate\n"); } // Look for absent nodes (nodes expected to be in the pool but aren't) if (NULL != console) { missing_nodes = console->findAbsentNodes(); if (0 < missing_nodes.size()) { for (std::list<std::string>::iterator node = missing_nodes.begin(); node != missing_nodes.end(); ++ node) { eventText = node->c_str(); eventText += " is missing from the pool"; EventCondorTriggerNotify event(eventText, time(NULL)); singleton->getInstance()->raiseEvent(event); dprintf(D_FULLDEBUG, "Triggerd: Raised event with text '%s'\n", eventText.c_str()); } } } return ret_val; }
// This handler is called when a client wishes to write files from the // transferd's storage. int TransferD::write_files_handler(int cmd, Stream *sock) { ReliSock *rsock = (ReliSock*)sock; MyString capability; int protocol = FTP_UNKNOWN; TransferRequest *treq = NULL; MyString fquser; static int transfer_reaper_id = -1; ThreadArg *thread_arg; int tid; ClassAd reqad; ClassAd respad; cmd = cmd; // quiet the compiler. dprintf(D_ALWAYS, "Got TRANSFERD_WRITE_FILES!\n"); ///////////////////////////////////////////////////////////////////////// // make sure we are authenticated ///////////////////////////////////////////////////////////////////////// if( ! rsock->triedAuthentication() ) { CondorError errstack; if( ! SecMan::authenticate_sock(rsock, WRITE, &errstack) ) { // we failed to authenticate, we should bail out now // since we don't know what user is trying to perform // this action. // TODO: it'd be nice to print out what failed, but we // need better error propagation for that... errstack.push( "TransferD::setup_transfer_request_handler()", 42, "Failure to register transferd - Authentication failed" ); dprintf( D_ALWAYS, "setup_transfer_request_handler() " "aborting: %s\n", errstack.getFullText() ); refuse( rsock ); return CLOSE_STREAM; } } fquser = rsock->getFullyQualifiedUser(); ///////////////////////////////////////////////////////////////////////// // Check to see if the capability the client tells us is something that // we have knowledge of. We ONLY check the capability and not the // identity of the person in question. This allows people of different // identities to write files here as long as they had the right // capability. While this might not sound secure, they STILL had to have // authenticated as someone this daemon trusts. // Similarly, check the protocol it wants to use as well as ensure that // the direction the transfer request was supposed to be is being honored. 
///////////////////////////////////////////////////////////////////////// rsock->decode(); // soak the request ad from the client about what it wants to transfer reqad.initFromStream(*rsock); rsock->end_of_message(); reqad.LookupString(ATTR_TREQ_CAPABILITY, capability); rsock->encode(); // do I know of such a capability? if (m_treqs.lookup(capability, treq) != 0) { // didn't find it. Log it and tell them to leave and close up shop respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE); respad.Assign(ATTR_TREQ_INVALID_REASON, "Invalid capability!"); respad.put(*rsock); rsock->end_of_message(); dprintf(D_ALWAYS, "Client identity '%s' tried to write some files " "using capability '%s', but there was no such capability. " "Access denied.\n", fquser.Value(), capability.Value()); return CLOSE_STREAM; } reqad.LookupInteger(ATTR_TREQ_FTP, protocol); // am I willing to use this protocol? switch(protocol) { case FTP_CFTP: // FileTrans protocol, I'm happy. break; default: respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE); respad.Assign(ATTR_TREQ_INVALID_REASON, "Invalid file transfer protocol!"); respad.put(*rsock); rsock->end_of_message(); dprintf(D_ALWAYS, "Client identity '%s' tried to write some files " "using protocol '%d', but I don't support that protocol. " "Access denied.\n", fquser.Value(), protocol); return CLOSE_STREAM; } // nsure that this transfer request was of the uploading variety if (treq->get_direction() != FTPD_UPLOAD) { respad.Assign(ATTR_TREQ_INVALID_REQUEST, TRUE); respad.Assign(ATTR_TREQ_INVALID_REASON, "Transfer Request was not an uploading request!"); respad.put(*rsock); rsock->end_of_message(); dprintf(D_ALWAYS, "Client identity '%s' tried to write some files " "to a transfer request that wasn't expecting to be written. " "Access denied.\n", fquser.Value()); } ///////////////////////////////////////////////////////////////////////// // Tell the client everything was ok. 
///////////////////////////////////////////////////////////////////////// respad.Assign(ATTR_TREQ_INVALID_REQUEST, FALSE); respad.put(*rsock); rsock->end_of_message(); ///////////////////////////////////////////////////////////////////////// // Set up a thread (a process under unix) to read ALL of the job files // for all of the ads in the TransferRequest. ///////////////////////////////////////////////////////////////////////// // now create a thread, passing in the sock, which uses the file transfer // object to accept the files. if (transfer_reaper_id == -1) { // only set this up ONCE so each and every thread gets one. transfer_reaper_id = daemonCore->Register_Reaper( "write_files_reaper", (ReaperHandlercpp) &TransferD::write_files_reaper, "write_files_reaper", this ); } thread_arg = new ThreadArg(protocol, treq); // Start a new thread (process on Unix) to do the work tid = daemonCore->Create_Thread( (ThreadStartFunc)&TransferD::write_files_thread, (void *)thread_arg, rsock, transfer_reaper_id ); if (tid == FALSE) { // XXX How do I handle this failure? } // associate the tid with the request so I can deal with it propery in // the reaper m_client_to_transferd_threads.insert(tid, treq); // The stream is inherited to the thread, who does the transfer and // finishes the protocol, but in the parent, I'm closing it. return CLOSE_STREAM; }
void procArg(const char* arg) { int c, p; // cluster/proc # char* tmp; MyString constraint; if( str_isint(arg) || str_isreal(arg,true) ) // process by cluster/proc # { c = strtol(arg, &tmp, 10); if(c <= 0) { fprintf(stderr, "Invalid cluster # from %s.\n", arg); had_error = true; return; } if(*tmp == '\0') // delete the cluster { CondorError errstack; constraint.formatstr( "%s == %d", ATTR_CLUSTER_ID, c ); if( doWorkByConstraint(constraint.Value(), &errstack) ) { fprintf( stdout, "Cluster %d %s.\n", c, (mode == JA_REMOVE_JOBS) ? "has been marked for removal" : (mode == JA_REMOVE_X_JOBS) ? "has been removed locally (remote state unknown)" : actionWord(mode,true) ); } else { fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if (had_error) { fprintf( stderr, "Couldn't find/%s all jobs in cluster %d.\n", actionWord(mode,false), c ); } } return; } if(*tmp == '.') { p = strtol(tmp + 1, &tmp, 10); if(p < 0) { fprintf( stderr, "Invalid proc # from %s.\n", arg); had_error = true; return; } if(*tmp == '\0') // process a proc { if( ! job_ids ) { job_ids = new StringList(); } job_ids->append( arg ); return; } } fprintf( stderr, "Warning: unrecognized \"%s\" skipped.\n", arg ); return; } // process by user name else { CondorError errstack; constraint.formatstr("%s == \"%s\"", ATTR_OWNER, arg ); if( doWorkByConstraint(constraint.Value(), &errstack) ) { fprintf( stdout, "User %s's job(s) %s.\n", arg, (mode == JA_REMOVE_JOBS) ? "have been marked for removal" : (mode == JA_REMOVE_X_JOBS) ? "have been removed locally (remote state unknown)" : actionWord(mode,true) ); } else { fprintf( stderr, "%s\n", errstack.getFullText(true).c_str() ); if (had_error) { fprintf( stderr, "Couldn't find/%s all of user %s's job(s).\n", actionWord(mode,false), arg ); } } } }
int main(int argc, char **argv) { char ** ptr; const char * myName; // find our name myName = strrchr( argv[0], DIR_DELIM_CHAR ); if( !myName ) { myName = argv[0]; } else { myName++; } int cred_type = 0; char * cred_name = NULL; char * cred_file_name = NULL; char * myproxy_user = NULL; char * myproxy_host = NULL; int myproxy_port = 0; char * myproxy_dn = NULL; char * server_address= NULL; // read config file myDistro->Init (argc, argv); config(); for (ptr=argv+1,argc--; argc > 0; argc--,ptr++) { if ( ptr[0][0] == '-' ) { switch ( ptr[0][1] ) { case 'h': usage(myName); exit(0); break; case 'd': // dprintf to console Termlog = 1; dprintf_config ("TOOL", get_param_functions()); break; case 'S': // dprintf to console Termlog = 1; Read_Myproxy_pw_terminal = false; break; case 'n': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -n requires another argument\n", myName ); exit(1); } server_address = strdup (*ptr); break; case 't': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -t requires another argument\n", myName ); exit(1); } if (strcmp (*ptr, "x509") == 0) { cred_type = X509_CREDENTIAL_TYPE; } else { fprintf( stderr, "Invalid credential type %s\n", *ptr ); exit(1); } break; case 'f': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -f requires another argument\n", myName ); exit(1); } cred_file_name = strdup (*ptr); break; case 'N': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -N requires another argument\n", myName ); exit(1); } cred_name = strdup (*ptr); break; case 'm': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -m requires another argument\n", myName ); exit(1); } parseMyProxyArgument (*ptr, myproxy_user, myproxy_host, myproxy_port); break; case 'D': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -D requires another argument\n", myName ); exit(1); } myproxy_dn = strdup (*ptr); break; case 'v': version(); // this function calls exit(0) break; default: fprintf( stderr, "%s: Unknown option %s\n", myName, *ptr); 
usage(myName); exit(1); } } //fi } //rof if (( cred_file_name == NULL ) || (cred_type == 0)) { fprintf ( stderr, "Credential filename or type not specified\n"); exit (1); } Credential * cred = NULL; if (cred_type == X509_CREDENTIAL_TYPE) { cred = new X509Credential(); } else { fprintf ( stderr, "Invalid credential type\n"); exit (1); } char * data = NULL; int data_size; if (!read_file (cred_file_name, data, data_size)) { fprintf (stderr, "Can't open %s\n", cred_file_name); exit (1); } cred->SetData (data, data_size); if (cred_name !=NULL) { cred->SetName(cred_name); } else { cred->SetName(DEFAULT_CREDENTIAL_NAME); } char * username = my_username(0); cred->SetOwner (username); if (cred_type == X509_CREDENTIAL_TYPE && myproxy_host != NULL) { X509Credential * x509cred = (X509Credential*)cred; MyString str_host_port = myproxy_host; if (myproxy_port != 0) { str_host_port += ":"; str_host_port += myproxy_port; } x509cred->SetMyProxyServerHost (str_host_port.Value()); if (myproxy_user != NULL) { x509cred->SetMyProxyUser (myproxy_user); } else { x509cred->SetMyProxyUser (username); } if (myproxy_dn != NULL) { x509cred->SetMyProxyServerDN (myproxy_dn); } char * myproxy_password; if ( Read_Myproxy_pw_terminal ) { myproxy_password = prompt_password( "Please enter the MyProxy password:"******"Please enter the MyProxy password from the standard input\n"); } if (myproxy_password) { x509cred->SetRefreshPassword ( myproxy_password ); } x509cred->display( D_FULLDEBUG ); } CondorError errstack; DCCredd dc_credd (server_address); // resolve server address if ( ! dc_credd.locate() ) { fprintf (stderr, "%s\n", dc_credd.error() ); return 1; } if (dc_credd.storeCredential(cred, errstack)) { printf ("Credential submitted successfully\n"); } else { fprintf (stderr, "Unable to submit credential\n%s\n", errstack.getFullText(true)); return 1; } return 0; }
/**
 * Process the current history file.
 *
 * 1) check to see if it is properly initialized, recording id (inode)
 * 2) stat the current history file
 * 3) poll for new entries and process them
 * 4) detect rotations
 *
 * State is kept in function-local statics: the filename and the
 * HistoryFile object persist across calls, so each invocation resumes
 * where the previous one left off.  The global force_reset flag (set
 * externally and on rotation below) forces re-initialization.
 */
void
aviary::history::processCurrentHistory()
{
	// Persistent across calls: path of the live "history" file and the
	// reader positioned within it.
	static MyString currentHistoryFilename = m_path + DIR_DELIM_STRING + "history";
	static HistoryFile currentHistory ( currentHistoryFilename.Value() );

	CondorError errstack;

	if (force_reset) {
		// Drop any cached reader state before re-initializing below.
		currentHistory.cleanup();
	}

	// (1)
	// Initialize (or re-initialize) the reader and remember the file's
	// id — presumably the inode — so rotation can be detected later.
	long unsigned int id;
	if ( !currentHistory.getId ( id ) || force_reset) {
		// at this point adjust the reset flag
		force_reset = false;

		if ( !currentHistory.init ( errstack ) ) {
			dprintf ( D_ALWAYS, "%s\n", errstack.getFullText().c_str() );
			return;
		}
		ASSERT ( currentHistory.getId ( id ) );
		m_historyFiles.insert ( id );
	}

	// (2)
	// Stat before poll to handle race of: poll + write + rotate + stat
	// (stat-ing first guarantees we never miss entries written between
	// the poll and the rotation check).  Stat() returns non-zero on
	// failure here.
	StatWrapper stat_wrapper;
	if ( stat_wrapper.Stat ( currentHistoryFilename ) ) {
		dprintf ( D_ALWAYS, "Failed to stat %s: %d (%s)\n",
				  currentHistoryFilename.Value(),
				  stat_wrapper.GetErrno(),
				  strerror ( stat_wrapper.GetErrno() ) );
		return;
	}
	const StatStructType *stat = stat_wrapper.GetBuf();
	ASSERT ( currentHistory.getId ( id ) );

	// (3)
	// Consume and process any entries appended since the last poll.
	// NOTE(review): errors accumulated in errstack by poll() are not
	// reported here — confirm that poll() logs its own failures.
	errstack.clear();
	HistoryFile::HistoryEntriesTypeIterators poll = currentHistory.poll ( errstack );
	for ( HistoryFile::HistoryEntriesTypeIterator i = poll.first;
		  i != poll.second; i++ ) {
		process ( ( *i ) );
	}

	// (4)
	// If different the file has rotated: a new "history" file now lives
	// at the same path.  Start reading it from scratch and flag a reset
	// so the next call re-checks initialization.
	if ( id != stat->st_ino ) {
		currentHistory = HistoryFile ( currentHistoryFilename.Value() );
		if ( !currentHistory.init ( errstack ) ) {
			dprintf ( D_ALWAYS, "%s\n", errstack.getFullText().c_str() );
			return;
		}
		ASSERT ( currentHistory.getId ( id ) );
		m_historyFiles.insert ( id );
		force_reset = true;
		return;
	}
}
int main(int argc, char **argv) { char * server_address = NULL; char ** ptr; const char * myName; // find our name myName = strrchr( argv[0], DIR_DELIM_CHAR ); if( !myName ) { myName = argv[0]; } else { myName++; } // read config file myDistro->Init (argc, argv); config (); for (ptr=argv+1,argc--; argc > 0; argc--,ptr++) { if ( ptr[0][0] == '-' ) { switch ( ptr[0][1] ) { case 'h': usage(myName); exit(0); break; case 'd': // dprintf to console Termlog = 1; dprintf_config ("TOOL", get_param_functions()); break; case 'n': if( !(--argc) || !(*(++ptr)) ) { fprintf( stderr, "%s: -n requires another argument\n", myName ); exit(1); } server_address = strdup (*ptr); break; case 'v': version(); // this function calls exit(0) break; default: fprintf( stderr, "%s: Unknown option %s\n", myName, *ptr); usage(myName); exit(1); } } //fi } //rof CondorError errorstack; int number = 0; SimpleList <Credential*> result; DCCredd credd(server_address); // resolve server address if ( ! credd.locate() ) { fprintf (stderr, "%s\n", credd.error() ); return 1; } if (!credd.listCredentials (result, number, errorstack)) { fprintf (stderr, "Unable to retrieve credentials (%s)\n", errorstack.getFullText(true)); return 1; } if (number > 0) { Credential * cred; result.Rewind(); printf ("Name\tType\n-----\t-----\n"); while (result.Next (cred)) { printf ("%s\t%s\n", cred->GetName(), cred->GetTypeString()); } printf ("\nTotal %d\n", number); } else if (number == 0) { printf ("No credentials currently stored on this server\n"); } else { fprintf (stderr, "ERROR\n"); return 1; } return 0; }
//--------------------------------------------------------------------------- void main_init (int argc, char ** const argv) { printf ("Executing condor dagman ... \n"); // flag used if DAGMan is invoked with -WaitForDebug so we // wait for a developer to attach with a debugger... volatile int wait_for_debug = 0; // process any config vars -- this happens before we process // argv[], since arguments should override config settings dagman.Config(); // The DCpermission (last parm) should probably be PARENT, if it existed daemonCore->Register_Signal( SIGUSR1, "SIGUSR1", (SignalHandler) main_shutdown_remove, "main_shutdown_remove", NULL); /****** FOR TESTING ******* daemonCore->Register_Signal( SIGUSR2, "SIGUSR2", (SignalHandler) main_testing_stub, "main_testing_stub", NULL); ****** FOR TESTING ********/ debug_progname = condor_basename(argv[0]); // condor_submit_dag version from .condor.sub bool allowVerMismatch = false; const char *csdVersion = "undefined"; int i; for (i = 0 ; i < argc ; i++) { debug_printf( DEBUG_NORMAL, "argv[%d] == \"%s\"\n", i, argv[i] ); } if (argc < 2) Usage(); // Make sure an input file was specified // get dagman job id from environment, if it's there // (otherwise it will be set to "-1.-1.-1") dagman.DAGManJobId.SetFromString( getenv( EnvGetName( ENV_ID ) ) ); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Minimum legal version for a .condor.sub file to be compatible // with this condor_dagman binary. // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! // Be sure to change this if the arguments or environment // passed to condor_dagman change in an incompatible way!! // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! struct DagVersionData { int majorVer; int minorVer; int subMinorVer; }; const DagVersionData MIN_SUBMIT_FILE_VERSION = { 7, 1, 2 }; // Construct a string of the minimum submit file version. 
MyString minSubmitVersionStr; minSubmitVersionStr.formatstr( "%d.%d.%d", MIN_SUBMIT_FILE_VERSION.majorVer, MIN_SUBMIT_FILE_VERSION.minorVer, MIN_SUBMIT_FILE_VERSION.subMinorVer ); //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // // Process command-line arguments // for (i = 1; i < argc; i++) { if( !strcasecmp( "-Debug", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No debug level specified\n" ); Usage(); } debug_level = (debug_level_t) atoi (argv[i]); } else if( !strcasecmp( "-Lockfile", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No DagMan lockfile specified\n" ); Usage(); } lockFileName = argv[i]; } else if( !strcasecmp( "-Help", argv[i] ) ) { Usage(); } else if (!strcasecmp( "-Dag", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No DAG specified\n" ); Usage(); } dagman.dagFiles.append( argv[i] ); } else if( !strcasecmp( "-MaxIdle", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxIdle\n" ); Usage(); } dagman.maxIdle = atoi( argv[i] ); } else if( !strcasecmp( "-MaxJobs", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxJobs\n" ); Usage(); } dagman.maxJobs = atoi( argv[i] ); } else if( !strcasecmp( "-MaxScripts", argv[i] ) ) { debug_printf( DEBUG_SILENT, "-MaxScripts has been replaced with " "-MaxPre and -MaxPost arguments\n" ); Usage(); } else if( !strcasecmp( "-MaxPre", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxPre\n" ); Usage(); } dagman.maxPreScripts = atoi( argv[i] ); } else if( !strcasecmp( "-MaxPost", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "Integer missing after -MaxPost\n" ); Usage(); } dagman.maxPostScripts = 
atoi( argv[i] ); } else if( !strcasecmp( "-NoEventChecks", argv[i] ) ) { debug_printf( DEBUG_QUIET, "Warning: -NoEventChecks is " "ignored; please use the DAGMAN_ALLOW_EVENTS " "config parameter instead\n"); check_warning_strictness( DAG_STRICT_1 ); } else if( !strcasecmp( "-AllowLogError", argv[i] ) ) { dagman.allowLogError = true; } else if( !strcasecmp( "-DontAlwaysRunPost",argv[i] ) ) { dagman._runPost = false; } else if( !strcasecmp( "-WaitForDebug", argv[i] ) ) { wait_for_debug = 1; } else if( !strcasecmp( "-UseDagDir", argv[i] ) ) { dagman.useDagDir = true; } else if( !strcasecmp( "-AutoRescue", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No AutoRescue value specified\n" ); Usage(); } dagman.autoRescue = (atoi( argv[i] ) != 0); } else if( !strcasecmp( "-DoRescueFrom", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No rescue DAG number specified\n" ); Usage(); } dagman.doRescueFrom = atoi (argv[i]); } else if( !strcasecmp( "-CsdVersion", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No CsdVersion value specified\n" ); Usage(); } csdVersion = argv[i]; } else if( !strcasecmp( "-AllowVersionMismatch", argv[i] ) ) { allowVerMismatch = true; } else if( !strcasecmp( "-DumpRescue", argv[i] ) ) { dagman.dumpRescueDag = true; } else if( !strcasecmp( "-verbose", argv[i] ) ) { dagman._submitDagDeepOpts.bVerbose = true; } else if( !strcasecmp( "-force", argv[i] ) ) { dagman._submitDagDeepOpts.bForce = true; } else if( !strcasecmp( "-notification", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No notification value specified\n" ); Usage(); } dagman._submitDagDeepOpts.strNotification = argv[i]; } else if( !strcasecmp( "-dagman", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No dagman value specified\n" ); Usage(); } 
dagman._submitDagDeepOpts.strDagmanPath = argv[i]; } else if( !strcasecmp( "-outfile_dir", argv[i] ) ) { i++; if( argc <= i || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_SILENT, "No outfile_dir value specified\n" ); Usage(); } dagman._submitDagDeepOpts.strOutfileDir = argv[i]; } else if( !strcasecmp( "-update_submit", argv[i] ) ) { dagman._submitDagDeepOpts.updateSubmit = true; } else if( !strcasecmp( "-import_env", argv[i] ) ) { dagman._submitDagDeepOpts.importEnv = true; } else if( !strcasecmp( "-priority", argv[i] ) ) { ++i; if( i >= argc || strcmp( argv[i], "" ) == 0 ) { debug_printf( DEBUG_NORMAL, "No priority value specified\n"); Usage(); } dagman._submitDagDeepOpts.priority = atoi(argv[i]); } else if( !strcasecmp( "-dont_use_default_node_log", argv[i] ) ) { dagman._submitDagDeepOpts.always_use_node_log = false; } else { debug_printf( DEBUG_SILENT, "\nUnrecognized argument: %s\n", argv[i] ); Usage(); } } dagman.dagFiles.rewind(); dagman.primaryDagFile = dagman.dagFiles.next(); dagman.multiDags = (dagman.dagFiles.number() > 1); MyString tmpDefaultLog; if ( dagman._defaultNodeLog != NULL ) { tmpDefaultLog = dagman._defaultNodeLog; free( dagman._defaultNodeLog ); } else { tmpDefaultLog = dagman.primaryDagFile + ".nodes.log"; } // Force default log file path to be absolute so it works // with -usedagdir and DIR nodes. 
CondorError errstack; if ( !MultiLogFiles::makePathAbsolute( tmpDefaultLog, errstack) ) { debug_printf( DEBUG_QUIET, "Unable to convert default log " "file name to absolute path: %s\n", errstack.getFullText().c_str() ); dagman.dag->GetJobstateLog().WriteDagmanFinished( EXIT_ERROR ); DC_Exit( EXIT_ERROR ); } dagman._defaultNodeLog = strdup( tmpDefaultLog.Value() ); debug_printf( DEBUG_NORMAL, "Default node log file is: <%s>\n", dagman._defaultNodeLog); // // Check the arguments // //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Checking for version compatibility between the .condor.sub // file and this condor_dagman binary... // Note: if we're in recovery mode and the submit file version // causes us to quit, we leave any existing node jobs still // running -- may want to change that eventually. wenger 2009-10-13. // Version of the condor_submit_dag that created our submit file. CondorVersionInfo submitFileVersion( csdVersion ); // Version of this condor_dagman binary. CondorVersionInfo dagmanVersion; // Just generate this message fragment in one place. MyString versionMsg; versionMsg.formatstr("the version (%s) of this DAG's Condor submit " "file (created by condor_submit_dag)", csdVersion ); // Make sure version in submit file is valid. if( !submitFileVersion.is_valid() ) { if ( !allowVerMismatch ) { debug_printf( DEBUG_QUIET, "Error: %s is invalid!\n", versionMsg.Value() ); DC_Exit( EXIT_ERROR ); } else { debug_printf( DEBUG_NORMAL, "Warning: %s is invalid; " "continuing because of -AllowVersionMismatch flag\n", versionMsg.Value() ); } // Make sure .condor.sub file is recent enough. 
} else if ( submitFileVersion.compare_versions( CondorVersion() ) != 0 ) { if( !submitFileVersion.built_since_version( MIN_SUBMIT_FILE_VERSION.majorVer, MIN_SUBMIT_FILE_VERSION.minorVer, MIN_SUBMIT_FILE_VERSION.subMinorVer ) ) { if ( !allowVerMismatch ) { debug_printf( DEBUG_QUIET, "Error: %s is older than " "oldest permissible version (%s)\n", versionMsg.Value(), minSubmitVersionStr.Value() ); DC_Exit( EXIT_ERROR ); } else { debug_printf( DEBUG_NORMAL, "Warning: %s is older than " "oldest permissible version (%s); continuing " "because of -AllowVersionMismatch flag\n", versionMsg.Value(), minSubmitVersionStr.Value() ); } // Warn if .condor.sub file is a newer version than this binary. } else if (dagmanVersion.compare_versions( csdVersion ) > 0 ) { debug_printf( DEBUG_NORMAL, "Warning: %s is newer than " "condor_dagman version (%s)\n", versionMsg.Value(), CondorVersion() ); check_warning_strictness( DAG_STRICT_3 ); } else { debug_printf( DEBUG_NORMAL, "Note: %s differs from " "condor_dagman version (%s), but the " "difference is permissible\n", versionMsg.Value(), CondorVersion() ); } } //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if( dagman.primaryDagFile == "" ) { debug_printf( DEBUG_SILENT, "No DAG file was specified\n" ); Usage(); } if (lockFileName == NULL) { debug_printf( DEBUG_SILENT, "No DAG lock file was specified\n" ); Usage(); } if( dagman.maxJobs < 0 ) { debug_printf( DEBUG_SILENT, "-MaxJobs must be non-negative\n"); Usage(); } if( dagman.maxPreScripts < 0 ) { debug_printf( DEBUG_SILENT, "-MaxPre must be non-negative\n" ); Usage(); } if( dagman.maxPostScripts < 0 ) { debug_printf( DEBUG_SILENT, "-MaxPost must be non-negative\n" ); Usage(); } if( dagman.doRescueFrom < 0 ) { debug_printf( DEBUG_SILENT, "-DoRescueFrom must be non-negative\n" ); Usage(); } debug_printf( DEBUG_VERBOSE, "DAG Lockfile will be written to %s\n", lockFileName ); if ( dagman.dagFiles.number() == 1 ) { debug_printf( DEBUG_VERBOSE, "DAG Input file is %s\n", 
dagman.primaryDagFile.Value() ); } else { MyString msg = "DAG Input files are "; dagman.dagFiles.rewind(); const char *dagFile; while ( (dagFile = dagman.dagFiles.next()) != NULL ) { msg += dagFile; msg += " "; } msg += "\n"; debug_printf( DEBUG_VERBOSE, "%s", msg.Value() ); } // if requested, wait for someone to attach with a debugger... while( wait_for_debug ) { } { MyString cwd; if( !condor_getcwd(cwd) ) { cwd = "<null>"; } debug_printf( DEBUG_DEBUG_1, "Current path is %s\n",cwd.Value()); char *temp = my_username(); debug_printf( DEBUG_DEBUG_1, "Current user is %s\n", temp ? temp : "<null>" ); if( temp ) { free( temp ); } } // // Figure out the rescue DAG to run, if any (this is with "new- // style" rescue DAGs). // int rescueDagNum = 0; MyString rescueDagMsg; if ( dagman.doRescueFrom != 0 ) { rescueDagNum = dagman.doRescueFrom; rescueDagMsg.formatstr( "Rescue DAG number %d specified", rescueDagNum ); RenameRescueDagsAfter( dagman.primaryDagFile.Value(), dagman.multiDags, rescueDagNum, dagman.maxRescueDagNum ); } else if ( dagman.autoRescue ) { rescueDagNum = FindLastRescueDagNum( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum ); rescueDagMsg.formatstr( "Found rescue DAG number %d", rescueDagNum ); } // // Fill in values in the deep submit options that we haven't // already set. // dagman._submitDagDeepOpts.bAllowLogError = dagman.allowLogError; dagman._submitDagDeepOpts.useDagDir = dagman.useDagDir; dagman._submitDagDeepOpts.autoRescue = dagman.autoRescue; dagman._submitDagDeepOpts.doRescueFrom = dagman.doRescueFrom; dagman._submitDagDeepOpts.allowVerMismatch = allowVerMismatch; dagman._submitDagDeepOpts.recurse = false; // // Create the DAG // // Note: a bunch of the parameters we pass here duplicate things // in submitDagOpts, but I'm keeping them separate so we don't have to // bother to construct a new SubmitDagOtions object for splices. 
// wenger 2010-03-25 dagman.dag = new Dag( dagman.dagFiles, dagman.maxJobs, dagman.maxPreScripts, dagman.maxPostScripts, dagman.allowLogError, dagman.useDagDir, dagman.maxIdle, dagman.retrySubmitFirst, dagman.retryNodeFirst, dagman.condorRmExe, dagman.storkRmExe, &dagman.DAGManJobId, dagman.prohibitMultiJobs, dagman.submitDepthFirst, dagman._defaultNodeLog, dagman._generateSubdagSubmits, &dagman._submitDagDeepOpts, false ); /* toplevel dag! */ if( dagman.dag == NULL ) { EXCEPT( "ERROR: out of memory!\n"); } dagman.dag->SetAbortOnScarySubmit( dagman.abortOnScarySubmit ); dagman.dag->SetAllowEvents( dagman.allow_events ); dagman.dag->SetConfigFile( dagman._dagmanConfigFile ); dagman.dag->SetMaxJobHolds( dagman._maxJobHolds ); dagman.dag->SetPostRun(dagman._runPost); if( dagman._submitDagDeepOpts.priority != 0 ) { // From command line dagman.dag->SetDefaultPriority(dagman._submitDagDeepOpts.priority); } else if( dagman._defaultPriority != 0 ) { // From config file dagman.dag->SetDefaultPriority(dagman._defaultPriority); dagman._submitDagDeepOpts.priority = dagman._defaultPriority; } // // Parse the input files. The parse() routine // takes care of adding jobs and dependencies to the DagMan // dagman.mungeNodeNames = (dagman.dagFiles.number() > 1); parseSetDoNameMunge( dagman.mungeNodeNames ); debug_printf( DEBUG_VERBOSE, "Parsing %d dagfiles\n", dagman.dagFiles.number() ); dagman.dagFiles.rewind(); char *dagFile; // Here we make a copy of the dagFiles for iteration purposes. Deep inside // of the parsing, copies of the dagman.dagFile string list happen which // mess up the iteration of this list. StringList sl( dagman.dagFiles ); sl.rewind(); while ( (dagFile = sl.next()) != NULL ) { debug_printf( DEBUG_VERBOSE, "Parsing %s ...\n", dagFile ); if( !parse( dagman.dag, dagFile, dagman.useDagDir ) ) { if ( dagman.dumpRescueDag ) { // Dump the rescue DAG so we can see what we got // in the failed parse attempt. 
debug_printf( DEBUG_QUIET, "Dumping rescue DAG " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, false, true, false ); } dagman.dag->RemoveRunningJobs(dagman, true); MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); // Note: debug_error calls DC_Exit(). debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n", dagFile ); } } if( dagman.dag->GetDefaultPriority() != 0 ) { dagman.dag->SetDefaultPriorities(); // Applies to the nodes of the dag } dagman.dag->GetJobstateLog().WriteDagmanStarted( dagman.DAGManJobId ); if ( rescueDagNum > 0 ) { // Get our Pegasus sequence numbers set correctly. dagman.dag->GetJobstateLog().InitializeRescue(); } // lift the final set of splices into the main dag. dagman.dag->LiftSplices(SELF); // // Actually parse the "new-new" style (partial DAG info only) // rescue DAG here. Note: this *must* be done after splices // are lifted! // if ( rescueDagNum > 0 ) { dagman.rescueFileToRun = RescueDagName( dagman.primaryDagFile.Value(), dagman.multiDags, rescueDagNum ); debug_printf ( DEBUG_QUIET, "%s; running %s in combination with " "normal DAG file%s\n", rescueDagMsg.Value(), dagman.rescueFileToRun.Value(), dagman.multiDags ? "s" : ""); debug_printf ( DEBUG_QUIET, "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"); debug_printf ( DEBUG_QUIET, "USING RESCUE DAG %s\n", dagman.rescueFileToRun.Value() ); // Turn off node name munging for the rescue DAG, because // it will already have munged node names. parseSetDoNameMunge( false ); if( !parse( dagman.dag, dagman.rescueFileToRun.Value(), dagman.useDagDir ) ) { if ( dagman.dumpRescueDag ) { // Dump the rescue DAG so we can see what we got // in the failed parse attempt. 
debug_printf( DEBUG_QUIET, "Dumping rescue DAG " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, true, false ); } dagman.dag->RemoveRunningJobs(dagman, true); MSC_SUPPRESS_WARNING_FIXME(6031) // return falue of unlink ignored. unlink( lockFileName ); dagman.CleanUp(); // Note: debug_error calls DC_Exit(). debug_error( 1, DEBUG_QUIET, "Failed to parse %s\n", dagFile ); } } dagman.dag->CheckThrottleCats(); // fix up any use of $(JOB) in the vars values for any node dagman.dag->ResolveVarsInterpolations(); /* debug_printf(DEBUG_QUIET, "COMPLETED DAG!\n");*/ /* dagman.dag->PrintJobList();*/ #ifndef NOT_DETECT_CYCLE if( dagman.startup_cycle_detect && dagman.dag->isCycle() ) { // Note: maybe we should run the final node here, if there is one. // wenger 2011-12-19. debug_error (1, DEBUG_QUIET, "ERROR: a cycle exists in the dag, please check input\n"); } #endif debug_printf( DEBUG_VERBOSE, "Dag contains %d total jobs\n", dagman.dag->NumNodes( true ) ); MyString firstLocation; if ( dagman.dag->GetReject( firstLocation ) ) { debug_printf( DEBUG_QUIET, "Exiting because of REJECT " "specification in %s. This most likely means " "that the DAG file was produced with the -DumpRescue " "flag when parsing the original DAG failed.\n", firstLocation.Value() ); DC_Exit( EXIT_ERROR ); return; } dagman.dag->DumpDotFile(); if ( dagman.dumpRescueDag ) { debug_printf( DEBUG_QUIET, "Dumping rescue DAG and exiting " "because of -DumpRescue flag\n" ); dagman.dag->Rescue( dagman.primaryDagFile.Value(), dagman.multiDags, dagman.maxRescueDagNum, false, false, false ); ExitSuccess(); return; } //------------------------------------------------------------------------ // Bootstrap and Recovery // // If the Lockfile exists, this indicates a premature termination // of a previous run of Dagman. 
If condor log is also present, // we run in recovery mode // If the Daglog is not present, then we do not run in recovery // mode { bool recovery = access(lockFileName, F_OK) == 0; if (recovery) { debug_printf( DEBUG_VERBOSE, "Lock file %s detected, \n", lockFileName); if (dagman.abortDuplicates) { if (util_check_lock_file(lockFileName) == 1) { debug_printf( DEBUG_QUIET, "Aborting because it " "looks like another instance of DAGMan is " "currently running on this DAG; if that is " "not the case, delete the lock file (%s) " "and re-submit the DAG.\n", lockFileName ); dagman.dag->GetJobstateLog(). WriteDagmanFinished( EXIT_RESTART ); dagman.CleanUp(); DC_Exit( EXIT_ERROR ); // We should never get to here! } } } // // If this DAGMan continues, it should overwrite the lock // file if it exists. // util_create_lock_file(lockFileName, dagman.abortDuplicates); debug_printf( DEBUG_VERBOSE, "Bootstrapping...\n"); if( !dagman.dag->Bootstrap( recovery ) ) { dagman.dag->PrintReadyQ( DEBUG_DEBUG_1 ); debug_error( 1, DEBUG_QUIET, "ERROR while bootstrapping\n"); } } debug_printf( DEBUG_VERBOSE, "Registering condor_event_timer...\n" ); daemonCore->Register_Timer( 1, dagman.m_user_log_scan_interval, condor_event_timer, "condor_event_timer" ); dagman.dag->SetPendingNodeReportInterval( dagman.pendingReportInterval ); }