// Utility function. void pushStringListBack( std::vector< YourString > & v, StringList & sl ) { const char * text = NULL; sl.rewind(); int count = 0; if( sl.number() > 0 ) { while( (text = sl.next()) ) { v.push_back( text ); ++count; } } ASSERT( count == sl.number() ); v.push_back( NULLSTRING ); }
int Job::submit(const struct condor__ClassAdStruct &jobAd, CondorError &errstack) { int i, rval; // XXX: This is ugly, and only should happen when spooling, // i.e. not always with cedar. rval = SetAttributeString(id.cluster, id.proc, ATTR_JOB_IWD, spoolDirectory.Value()); if (rval < 0) { errstack.pushf("SOAP", FAIL, "Failed to set job %d.%d's %s attribute to '%s'.", id.cluster, id.proc, ATTR_JOB_IWD, spoolDirectory.Value()); return rval; } StringList transferFiles; MyString currentKey; JobFile jobFile; declaredFiles.startIterations(); while (declaredFiles.iterate(currentKey, jobFile)) { transferFiles.append(jobFile.name.Value()); } char *fileList = NULL; if (0 == transferFiles.number()) { fileList = strdup(""); } else { fileList = transferFiles.print_to_string(); ASSERT(fileList); } rval = SetAttributeString(id.cluster, id.proc, ATTR_TRANSFER_INPUT_FILES, fileList); if (fileList) { free(fileList); fileList = NULL; } if (rval < 0) { errstack.pushf("SOAP", FAIL, "Failed to set job %d.%d's %s attribute.", id.cluster, id.proc, ATTR_TRANSFER_INPUT_FILES); return rval; } int found_iwd = 0; for (i = 0; i < jobAd.__size; i++) { const char* name = jobAd.__ptr[i].name; const char* value = jobAd.__ptr[i].value; if (!name) continue; if (!value) value="UNDEFINED"; // XXX: This is a quick fix. If processing MyType or // TargetType they should be ignored. Ideally we could // convert the ClassAdStruct to a ClassAd and then iterate // the ClassAd. 
if (0 == strcmp(name, ATTR_MY_TYPE) || 0 == strcmp(name, ATTR_TARGET_TYPE)) { continue; } if ( jobAd.__ptr[i].type == STRING_ATTR ) { // string type - put value in quotes as hint for ClassAd parser found_iwd = found_iwd || !strcmp(name, ATTR_JOB_IWD); rval = SetAttributeString(id.cluster, id.proc, name, value); } else { // all other types can be deduced by the ClassAd parser rval = SetAttribute(id.cluster, id.proc, name, value); } if ( rval < 0 ) { errstack.pushf("SOAP", FAIL, "Failed to set job %d.%d's %s attribute.", id.cluster, id.proc, name); return rval; } } // Trust the client knows what it is doing if there is an Iwd. if (!found_iwd) { // We need to make sure the Iwd is rewritten so files // in the spool directory can be found. rval = SetAttributeString(id.cluster, id.proc, ATTR_JOB_IWD, spoolDirectory.Value()); if (rval < 0) { errstack.pushf("SOAP", FAIL, "Failed to set %d.%d's %s attribute to '%s'.", id.cluster, id.proc, ATTR_JOB_IWD, spoolDirectory.Value()); return rval; } } return 0; }
//---------------------------------------------------------------------------
// Begin monitoring the user log file for this node.
//
// The log file name is taken from the node's submit file (Condor or Stork
// flavor); if none is found, defaultNodeLog is used.  When usingDefault is
// set and a log *was* found, DAGMan still monitors the default node log
// instead of the user-specified one.  The chosen path is saved in _logFile
// and registered with the appropriate ReadMultipleUserLogs reader.
//
// Params:
//   condorLogReader / storkLogReader - log readers; one is chosen by _jobType
//   nfsIsError     - treat a log on NFS as a hard error (else just warn)
//   recovery       - true when DAGMan is in recovery mode (passed through to
//                    monitorLogFile() as !recovery)
//   defaultNodeLog - fallback log file path
//   usingDefault   - whether the default log is in effect for this DAG
//
// Returns true on success (or if already monitored); false on failure
// (after calling LogMonitorFailed()).
bool Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
			bool recovery, const char *defaultNodeLog, bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

		// Monitoring twice would double-count events; bail out early.
	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1, "Warning: log file for node "
					"%s is already monitored\n", GetJobName() );
		return true;
	}

		// Condor and Stork jobs are tracked by separate readers.
	ReadMultipleUserLogs &logReader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;

	std::string logFileStr;
	if ( _jobType == TYPE_CONDOR ) {
			// We check to see if the user has specified a log file
			// If not, we give him a default
		MyString templogFileStr = MultiLogFiles::loadLogFileNameFromSubFile(
					_cmdFile, _directory, _logFileIsXml, usingDefault);
		logFileStr = templogFileStr.Value();
	} else {
			// Stork submit files may list several log files; exactly one
			// is required here.
		StringList logFiles;
		MyString tmpResult = MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, logFiles );
		if ( tmpResult != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						tmpResult.Value() );
			LogMonitorFailed();
			return false;
		} else if ( logFiles.number() != 1 ) {
			debug_printf( DEBUG_QUIET, "Error: %d Stork log files found "
						"in submit file %s; we want 1\n",
						logFiles.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		} else {
			logFiles.rewind();
			logFileStr = logFiles.next();
		}
	}

		// Warn the user if the node's log file is in /tmp.
	if ( logFileStr.find( "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET, "Warning: "
					"Log file %s for node %s is in /tmp\n",
					logFileStr.c_str(), GetJobName() );
		check_warning_strictness( usingDefault ? DAG_STRICT_2 : DAG_STRICT_1 );
	}

	if ( logFileStr == "" ) {
			// No log specified in the submit file -- fall back to the
			// default node log.
		logFileStr = defaultNodeLog;
		_useDefaultLog = true;
			// Default User log is never XML
			// This could be specified in the submit file and should be
			// ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL, "Unable to get log file from "
					"submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logFileStr.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if( append_default_log ) {
				// DAGman is not going to look at the user-specified log.
				// It will look at the defaultNode log.
			logFileStr = defaultNodeLog;
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

		// This function returns true if the log file is on NFS and
		// that is an error.  If the log file is on NFS, but nfsIsError
		// is false, it prints a warning but returns false.
	if ( MultiLogFiles::logFileNFSError( logFileStr.c_str(),
				nfsIsError ) ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logFileStr.c_str() );
		LogMonitorFailed();
		return false;
	}

	delete [] _logFile;
		// Saving log file here in case submit file gets changed.
	_logFile = strnewp( logFileStr.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );
	CondorError errstack;
	if ( !logReader.monitorLogFile( GetLogFile(), !recovery, errstack ) ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
			// NOTE(review): EXCEPT() presumably aborts the process, making
			// the return below unreachable -- confirm before relying on it.
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}
// Read complete GAHP command lines from stdin and dispatch them.
//
// Handles the protocol-level commands (RESULTS, VERSION, QUIT, ASYNC_MODE_*,
// COMMANDS, CREATE_CONDOR_SECURITY_SESSION, CONDOR_VERSION) inline, and
// spins up worker threads for the sandbox transfer commands
// (DOWNLOAD/UPLOAD/DESTROY_SANDBOX).  Malformed commands get an error reply.
//
// Returns TRUE so DaemonCore keeps the handler registered; exits the
// process via DC_Exit() on QUIT or when stdin closes.
int stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		// CREATE_CONDOR_SECURITY_SESSION contains sensitive data that
		// normally shouldn't be written to a publically-readable log.
		// We should conceal it unless GAHP_DEBUG_HIDE_SENSITIVE_DATA
		// says not to.
		if ( param_boolean( "GAHP_DEBUG_HIDE_SENSITIVE_DATA", true ) &&
			 strncmp( command, GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					  strlen( GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION ) ) == 0 ) {
			dprintf( D_ALWAYS, "got stdin: %s XXXXXXXX\n",
					 GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION );
		} else {
			dprintf (D_ALWAYS, "got stdin: %s\n", command);
		}

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {

					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
					// BUG FIX: a second, identical GAHP_COMMAND_QUIT branch
					// used to follow ASYNC_MODE_OFF below; it was unreachable
					// (this branch always matches first and DC_Exit() does
					// not return) and has been removed.
				gahp_output_return_success();
				DC_Exit(0);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					GAHP_COMMAND_CONDOR_VERSION,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
				gahp_output_return (commands, 12);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION) == 0) {
				ClaimIdParser claimid( args.argv[1] );
				if ( !daemonCore->getSecMan()->CreateNonNegotiatedSecuritySession(
										DAEMON,
										claimid.secSessionId(),
										claimid.secSessionKey(),
										claimid.secSessionInfo(),
										CONDOR_PARENT_FQU,
										NULL,
										0 ) ) {
					gahp_output_return_error();
				} else {
					sec_session_id = claimid.secSessionId();
					gahp_output_return_success();
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CONDOR_VERSION) == 0) {
				peer_condor_version = args.argv[1];

				const char *reply [] = { GAHP_RESULT_SUCCESS,
										 escapeGahpString( CondorVersion() ) };
				gahp_output_return( reply, 2 );

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

				// The worker thread reports errors back through this pipe.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					// NOTE: the trailing "NULL" element mirrors the original
					// behavior and differs from the upload/destroy branches.
					const char * res[2] = {
						"Worker thread failed",
						"NULL"
					};
					enqueue_result(args.argv[1], res, 2);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}

		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
// Read complete GAHP command lines from stdin and dispatch them.
//
// This variant of the handler supports the basic protocol commands
// (RESULTS, VERSION, QUIT, ASYNC_MODE_*, COMMANDS) plus worker threads for
// DOWNLOAD/UPLOAD/DESTROY_SANDBOX.  Malformed commands get an error reply.
//
// Returns TRUE so DaemonCore keeps the handler registered; exits the
// process via DC_Exit() on QUIT or when stdin closes.
int stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		dprintf (D_ALWAYS, "got stdin: %s\n", command);

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {

					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
					// BUG FIX: a second, identical GAHP_COMMAND_QUIT branch
					// used to follow ASYNC_MODE_OFF below; it was unreachable
					// (this branch always matches first and DC_Exit() does
					// not return) and has been removed.
				gahp_output_return_success();
				DC_Exit(0);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
				gahp_output_return (commands, 10);

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

				// The worker thread reports errors back through this pipe.
				// (EXCEPT messages no longer carry a trailing "\n", matching
				// the sibling handler in this codebase.)
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}

		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
// clears all the inner structures and loads the configuration parameters'
// values again
//
// Called at startup and on reconfig.  Tears down all existing state via
// finalize(), re-reads every REPLICATION_*/HAD_* knob, re-registers the
// collector-update and replication timers, and (once only) registers the
// transferer download/upload reapers.  Ends by entering the passive state.
void ReplicatorStateMachine::reinitialize()
{
    // delete all configurations and start everything over from the scratch
    finalize( );
    AbstractReplicatorStateMachine::reinitialize( );

    m_myVersion.initialize( m_stateFilePath, m_versionFilePath );
    m_replicationInterval =
        param_integer("REPLICATION_INTERVAL",
                      5 * MINUTE,
                      0); // min value, must be positive
    // deduce HAD alive tolerance
    int hadConnectionTimeout =
        param_integer("HAD_CONNECTION_TIMEOUT",
                      DEFAULT_SEND_COMMAND_TIMEOUT,
                      0); // min value, must be positive

    m_maxTransfererLifeTime =
        param_integer("MAX_TRANSFER_LIFETIME",
                      5 * MINUTE,
                      0); // min value, must be positive

    m_newlyJoinedWaitingVersionInterval =
        param_integer("NEWLY_JOINED_WAITING_VERSION_INTERVAL",
                      NEWLY_JOINED_TOLERANCE_FACTOR * (hadConnectionTimeout + 1),
                      0); // min value, must be positive

    // HAD_LIST is required: the alive tolerance scales with the number of
    // HAD daemons and their connection timeout.
    char* buffer = param( "HAD_LIST" );
    if ( buffer ) {
        StringList hadList;
        hadList.initializeFromString( buffer );
        free( buffer );
        m_hadAliveTolerance = HAD_ALIVE_TOLERANCE_FACTOR *
                              ( 2 * hadConnectionTimeout * hadList.number() + 1 );
        dprintf( D_FULLDEBUG,
                 "ReplicatorStateMachine::reinitialize %s=%d\n",
                 "HAD_LIST", m_hadAliveTolerance );
    } else {
        // missing HAD_LIST is a fatal configuration error
        utilCrucialError( utilNoParameterError( "HAD_LIST", "HAD" ).Value( ));
    }

    initializeClassAd();
    // Re-register the collector-update timer only when the interval
    // actually changed, to avoid needlessly resetting its phase.
    int updateInterval = param_integer ( "REPLICATION_UPDATE_INTERVAL", 300 );
    if ( m_updateInterval != updateInterval ) {
        m_updateInterval = updateInterval;
        utilCancelTimer(m_updateCollectorTimerId);
        m_updateCollectorTimerId = daemonCore->Register_Timer ( 0,
                m_updateInterval,
                (TimerHandlercpp) &ReplicatorStateMachine::updateCollectors,
                "ReplicatorStateMachine::updateCollectors", this );
    }

    // set a timer to replication routine
    dprintf( D_ALWAYS, "ReplicatorStateMachine::reinitialize setting "
             "replication timer\n" );
    m_replicationTimerId = daemonCore->Register_Timer(
                m_replicationInterval,
                (TimerHandlercpp) &ReplicatorStateMachine::replicationTimer,
                "Time to replicate file", this );

    // register the download/upload reaper for the transferer process
    // (-1 means "not registered yet"; reapers survive reconfig, so this
    // happens only once per daemon lifetime)
    if( m_downloadReaperId == -1 ) {
        m_downloadReaperId = daemonCore->Register_Reaper(
            "downloadReplicaTransfererReaper",
            (ReaperHandler)&ReplicatorStateMachine::downloadReplicaTransfererReaper,
            "downloadReplicaTransfererReaper", this );
    }
    if( m_uploadReaperId == -1 ) {
        m_uploadReaperId = daemonCore->Register_Reaper(
            "uploadReplicaTransfererReaper",
            (ReaperHandler) &ReplicatorStateMachine::uploadReplicaTransfererReaper,
            "uploadReplicaTransfererReaper", this );
    }
    // for debugging purposes only
    printDataMembers( );

    beforePassiveStateHandler( );
}
// Perform one round of batched status polling for this EC2 resource.
//
// Pass 1 (when m_checkSpotNext is false): query all instances via
// ec2_vm_status_all() and dispatch each six-field status row to the
// matching EC2Job -- first by instance ID, then by client token, then
// (for services that ignore client tokens) by ssh keypair.  Any job we
// know about that received no row is told it got no status.
// Pass 2 (m_checkSpotNext): the same for spot requests (five-field rows).
//
// Returns BSR_PENDING while a gahp command is outstanding, BSR_ERROR on
// query failure, and BSR_DONE when a full round has completed.
EC2Resource::BatchStatusResult EC2Resource::StartBatchStatus() {
	ASSERT( status_gahp );

	// m_checkSpotNext starts out false
	if( ! m_checkSpotNext ) {
		StringList returnStatus;
		std::string errorCode;
		int rc = status_gahp->ec2_vm_status_all( resourceName,
					m_public_key_file, m_private_key_file,
					returnStatus, errorCode );
		if( rc == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }

		if( rc != 0 ) {
			std::string errorString = status_gahp->getErrorString();
			dprintf( D_ALWAYS, "Error doing batched EC2 status query: %s: %s.\n",
					 errorCode.c_str(), errorString.c_str() );
			return BSR_ERROR;
		}

		//
		// We have to let a job know if we can't find a status report for it.
		//
		List<EC2Job> myJobs;
		EC2Job * nextJob = NULL;
		BaseJob *nextBaseJob = NULL;
		registeredJobs.Rewind();
		while ( (nextBaseJob = registeredJobs.Next()) ) {
			nextJob = dynamic_cast< EC2Job * >( nextBaseJob );
			ASSERT( nextJob );
			if ( !nextJob->m_client_token.empty() ) {
				myJobs.Append( nextJob );
			}
		}

		// Each instance is reported as six consecutive list entries.
		returnStatus.rewind();
		ASSERT( returnStatus.number() % 6 == 0 );
		for( int i = 0; i < returnStatus.number(); i += 6 ) {
			std::string instanceID = returnStatus.next();
			std::string status = returnStatus.next();
			std::string clientToken = returnStatus.next();
			std::string keyName = returnStatus.next();
			std::string stateReasonCode = returnStatus.next();
			std::string publicDNSName = returnStatus.next();

			// Efficiency suggests we look via the instance ID first,
			// and then try to look things up via the client token
			// (or, for GT #3682, via the keypair ID).

			// We can't use BaseJob::JobsByRemoteId because OpenStack doesn't
			// include the client token in its status responses, and therefore
			// we can't always fully reconstruct the remoteJobID used as the key.
			EC2Job * job = NULL;
			rc = jobsByInstanceID.lookup( HashKey( instanceID.c_str() ), job );
			if( rc == 0 ) {
				ASSERT( job );

				dprintf( D_FULLDEBUG, "Found job object for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
				job->StatusUpdate( instanceID.c_str(), status.c_str(),
								   stateReasonCode.c_str(), publicDNSName.c_str() );
				myJobs.Delete( job );
				continue;
			}

			// If we got a client token, use that to look up the job.  We
			// don't use the instance ID because we may discover it in
			// this function.  Since we need instance ID -based dispatch
			// code for OpenStack anyway, we'll just use it, rather than
			// trying the remoteJobID with the instance ID if we don't
			// find it using only the client token.
			if( ! clientToken.empty() && clientToken != "NULL" ) {
				std::string remoteJobID;
				formatstr( remoteJobID, "ec2 %s %s", resourceName, clientToken.c_str() );

				BaseJob * tmp = NULL;
				rc = BaseJob::JobsByRemoteId.lookup( HashKey( remoteJobID.c_str() ), tmp );

				if( rc == 0 ) {
					ASSERT( tmp );
					// Renamed from 'job' to avoid shadowing the outer lookup
					// variable.
					EC2Job * ctJob = dynamic_cast< EC2Job * >( tmp );
					if( ctJob == NULL ) {
						EXCEPT( "Found non-EC2Job identified by '%s'.",
								remoteJobID.c_str() );
					}

					dprintf( D_FULLDEBUG, "Found job object via client token for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
					ctJob->StatusUpdate( instanceID.c_str(), status.c_str(),
										 stateReasonCode.c_str(), publicDNSName.c_str() );
					myJobs.Delete( ctJob );
					continue;
				}
			}

			// Some servers (OpenStack, Eucalyptus) silently ignore client
			// tokens. So we need to use the ssh keypair to find jobs that
			// were submitted but which we don't have an instance ID for.
			//
			// TODO This code should be made more efficient. We can
			//   do something better than a linear scan through all
			//   jobs for each status result. Ideally, we'd parse the
			//   ssh keypair name and if it looks like one we generated,
			//   pluck out the job id.
			bool matchedViaKeypair = false;
			if ( !ClientTokenWorks() && !keyName.empty() && keyName != "NULL" ) {
				myJobs.Rewind();
				while ( ( job = myJobs.Next() ) ) {
					if ( job->m_key_pair == keyName ) {
						dprintf( D_FULLDEBUG, "Found job object via ssh keypair for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
						job->StatusUpdate( instanceID.c_str(), status.c_str(),
										   stateReasonCode.c_str(),
										   publicDNSName.c_str() );
						myJobs.Delete( job );
						matchedViaKeypair = true;
					}
				}
			}
			// BUG FIX: the old code used 'continue' inside the keypair scan,
			// which only continued that inner while loop -- a matched
			// instance still fell through to the misleading "unknown
			// instance" message below.
			if( matchedViaKeypair ) { continue; }

			dprintf( D_FULLDEBUG, "Found unknown instance '%s'; skipping.\n", instanceID.c_str() );
			continue;
		}

		// Anything left in myJobs got no status row this round.
		myJobs.Rewind();
		while( ( nextJob = myJobs.Next() ) ) {
			dprintf( D_FULLDEBUG, "Informing job %p it got no status.\n", nextJob );
			nextJob->StatusUpdate( NULL, NULL, NULL, NULL );
		}

		// Don't ask for spot results unless we know about a spot job.  This
		// should prevent us from breaking OpenStack.
		if( spotJobsByRequestID.getNumElements() == 0 ) {
			m_checkSpotNext = false;
			return BSR_DONE;
		} else {
			m_checkSpotNext = true;
		}
	}

	if( m_checkSpotNext ) {
		StringList spotReturnStatus;
		std::string spotErrorCode;
		int spotRC = status_gahp->ec2_spot_status_all( resourceName,
						m_public_key_file, m_private_key_file,
						spotReturnStatus, spotErrorCode );
		if( spotRC == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }

		if( spotRC != 0 ) {
			std::string errorString = status_gahp->getErrorString();
			dprintf( D_ALWAYS, "Error doing batched EC2 spot status query: %s: %s.\n",
					 spotErrorCode.c_str(), errorString.c_str() );
			return BSR_ERROR;
		}

		List<EC2Job> mySpotJobs;
		EC2Job * nextSpotJob = NULL;
		spotJobsByRequestID.startIterations();
		while( spotJobsByRequestID.iterate( nextSpotJob ) ) {
			mySpotJobs.Append( nextSpotJob );
		}

		// Each spot request is reported as five consecutive list entries;
		// the launch group and instance ID fields are skipped.
		spotReturnStatus.rewind();
		ASSERT( spotReturnStatus.number() % 5 == 0 );
		for( int i = 0; i < spotReturnStatus.number(); i += 5 ) {
			std::string requestID = spotReturnStatus.next();
			std::string state = spotReturnStatus.next();
			/* std::string launchGroup = */ spotReturnStatus.next();
			/* std::string instanceID = */ spotReturnStatus.next();
			std::string statusCode = spotReturnStatus.next();

			EC2Job * spotJob = NULL;
			spotRC = spotJobsByRequestID.lookup( HashKey( requestID.c_str() ), spotJob );
			if( spotRC != 0 ) {
				dprintf( D_FULLDEBUG, "Found unknown spot request '%s'; skipping.\n", requestID.c_str() );
				continue;
			}
			ASSERT( spotJob );

			// A status code, when present, is more specific than the state.
			if( ! statusCode.empty() ) { state = statusCode; }

			dprintf( D_FULLDEBUG, "Found spot job object for '%s', updating status ('%s').\n", requestID.c_str(), state.c_str() );
			spotJob->StatusUpdate( NULL, state.c_str(), NULL, NULL );
			mySpotJobs.Delete( spotJob );
		}

		// Anything left in mySpotJobs got no status row this round.
		mySpotJobs.Rewind();
		while( ( nextSpotJob = mySpotJobs.Next() ) ) {
			dprintf( D_FULLDEBUG, "Informing spot job %p it got no status.\n", nextSpotJob );
			nextSpotJob->StatusUpdate( NULL, NULL, NULL, NULL );
		}

		m_checkSpotNext = false;
		return BSR_DONE;
	}

	// This should never happen (but the compiler hates you).
	return BSR_ERROR;
}