Example #1
0
// Utility function: appends every string held by sl to v, in list order,
// and then appends NULLSTRING after them.  ASSERTs that the number of
// strings copied matches sl.number().
void
pushStringListBack( std::vector< YourString > & v, StringList & sl ) {
	int copied = 0;

	sl.rewind();
	if( sl.number() > 0 ) {
		// next() yields NULL once the list is exhausted.
		for( const char * entry = sl.next(); entry; entry = sl.next() ) {
			v.push_back( entry );
			++copied;
		}
	}
	ASSERT( copied == sl.number() );

	v.push_back( NULLSTRING );
}
Example #2
0
// Writes the attributes of this job to the job queue.
//
// Steps, in order:
//   1. Set ATTR_JOB_IWD to the spool directory.
//   2. Set ATTR_TRANSFER_INPUT_FILES to the names of all declared files
//      (an empty string when no files were declared).
//   3. Copy every named attribute of jobAd, skipping ATTR_MY_TYPE and
//      ATTR_TARGET_TYPE.  A missing value is written as "UNDEFINED".
//      STRING_ATTR attributes go through SetAttributeString(), all others
//      through SetAttribute().
//   4. If jobAd carried no string-typed ATTR_JOB_IWD, set ATTR_JOB_IWD to
//      the spool directory again.
//
// Parameters:
//   jobAd    - the attributes supplied by the client.
//   errstack - receives a "SOAP"/FAIL entry describing the first failure.
//
// Returns 0 on success; otherwise the negative value returned by the
// failing SetAttribute*() call (processing stops at the first failure).
int
Job::submit(const struct condor__ClassAdStruct &jobAd,
			CondorError &errstack)
{
	int i, rval;

		// XXX: This is ugly, and only should happen when spooling,
		// i.e. not always with cedar.
	rval = SetAttributeString(id.cluster,
							  id.proc,
							  ATTR_JOB_IWD,
							  spoolDirectory.Value());
	if (rval < 0) {
		errstack.pushf("SOAP",
					   FAIL,
					   "Failed to set job %d.%d's %s attribute to '%s'.",
					   id.cluster,
					   id.proc,
					   ATTR_JOB_IWD,
					   spoolDirectory.Value());

		return rval;
	}

		// Collect the names of all declared files.
	StringList transferFiles;
	MyString currentKey;
	JobFile jobFile;
	declaredFiles.startIterations();
	while (declaredFiles.iterate(currentKey, jobFile)) {
		transferFiles.append(jobFile.name.Value());
	}

		// With no files the attribute is set to the empty string.  A
		// literal is used for that case so there is no allocation that
		// could fail and hand a NULL pointer to SetAttributeString().
		// printedList owns the buffer returned by print_to_string().
	char *printedList = NULL;
	const char *fileList = "";
	if (0 != transferFiles.number()) {
		printedList = transferFiles.print_to_string();
		ASSERT(printedList);
		fileList = printedList;
	}

	rval = SetAttributeString(id.cluster,
							  id.proc,
							  ATTR_TRANSFER_INPUT_FILES,
							  fileList);

		// free(NULL) is a no-op, so no guard is needed.
	free(printedList);
	printedList = NULL;
	fileList = NULL;

	if (rval < 0) {
		errstack.pushf("SOAP",
					   FAIL,
					   "Failed to set job %d.%d's %s attribute.",
					   id.cluster,
					   id.proc,
					   ATTR_TRANSFER_INPUT_FILES);

		return rval;
	}

	int found_iwd = 0;
	for (i = 0; i < jobAd.__size; i++) {
		const char* name = jobAd.__ptr[i].name;
		const char* value = jobAd.__ptr[i].value;
		if (!name) continue;
		if (!value) value="UNDEFINED";

			// XXX: This is a quick fix. If processing MyType or
			// TargetType they should be ignored. Ideally we could
			// convert the ClassAdStruct to a ClassAd and then iterate
			// the ClassAd.
		if (0 == strcmp(name, ATTR_MY_TYPE) ||
			0 == strcmp(name, ATTR_TARGET_TYPE)) {
			continue;
		}

		if ( jobAd.__ptr[i].type == STRING_ATTR ) {
				// string type - put value in quotes as hint for ClassAd parser

				// Remember whether the client supplied its own Iwd.
			found_iwd = found_iwd || !strcmp(name, ATTR_JOB_IWD);

			rval = SetAttributeString(id.cluster, id.proc, name, value);
		} else {
				// all other types can be deduced by the ClassAd parser
			rval = SetAttribute(id.cluster, id.proc, name, value);
		}
		if ( rval < 0 ) {
			errstack.pushf("SOAP",
						   FAIL,
						   "Failed to set job %d.%d's %s attribute.",
						   id.cluster,
						   id.proc,
						   name);

			return rval;
		}
	}

		// Trust the client knows what it is doing if there is an Iwd.
	if (!found_iwd) {
			// We need to make sure the Iwd is rewritten so files
			// in the spool directory can be found.
		rval = SetAttributeString(id.cluster,
								  id.proc,
								  ATTR_JOB_IWD,
								  spoolDirectory.Value());
		if (rval < 0) {
			errstack.pushf("SOAP",
						   FAIL,
						   "Failed to set %d.%d's %s attribute to '%s'.",
						   id.cluster,
						   id.proc,
						   ATTR_JOB_IWD,
						   spoolDirectory.Value());

			return rval;
		}
	}

	return 0;
}
Example #3
0
//---------------------------------------------------------------------------
// Begins monitoring this node's user log file.
//
// The log file name is read from the node's submit file; when none is
// found, or when usingDefault is set, defaultNodeLog is monitored instead.
// The reader used is condorLogReader for TYPE_CONDOR nodes and
// storkLogReader for every other job type.
//
// Parameters:
//   condorLogReader / storkLogReader - candidate log readers (see above).
//   nfsIsError     - passed to MultiLogFiles::logFileNFSError().
//   recovery       - its negation is passed to monitorLogFile().
//   defaultNodeLog - log file to fall back to.
//   usingDefault   - whether the default node log is in use.
//
// Returns true if the log is (or already was) being monitored, false if
// the log file could not be determined or is rejected by the NFS check.
// A failure of monitorLogFile() itself ends in EXCEPT().
bool
Job::MonitorLogFile( ReadMultipleUserLogs &condorLogReader,
			ReadMultipleUserLogs &storkLogReader, bool nfsIsError,
			bool recovery, const char *defaultNodeLog, bool usingDefault )
{
	debug_printf( DEBUG_DEBUG_2,
				"Attempting to monitor log file for node %s\n",
				GetJobName() );

		// Nothing to do if we are already watching this node's log.
	if ( _logIsMonitored ) {
		debug_printf( DEBUG_DEBUG_1,
					"Warning: log file for node %s is already monitored\n",
					GetJobName() );
		return true;
	}

		// Step 1: find out which log file the submit file names.
	std::string logPath;
	if ( _jobType == TYPE_CONDOR ) {
			// We check to see if the user has specified a log file
			// If not, we give him a default
		MyString fromSubmitFile =
					MultiLogFiles::loadLogFileNameFromSubFile( _cmdFile,
					_directory, _logFileIsXml, usingDefault);
		logPath = fromSubmitFile.Value();
	} else {
		StringList storkLogs;
		MyString storkError =
					MultiLogFiles::loadLogFileNamesFromStorkSubFile(
					_cmdFile, _directory, storkLogs );

		if ( storkError != "" ) {
			debug_printf( DEBUG_QUIET, "Error getting Stork log file: %s\n",
						storkError.Value() );
			LogMonitorFailed();
			return false;
		}

			// A Stork submit file must name exactly one log file.
		if ( storkLogs.number() != 1 ) {
			debug_printf( DEBUG_QUIET,
						"Error: %d Stork log files found in submit file %s; we want 1\n",
						storkLogs.number(), _cmdFile );
			LogMonitorFailed();
			return false;
		}

		storkLogs.rewind();
		logPath = storkLogs.next();
	}

		// Step 2: warn the user if the node's log file is in /tmp
		// (i.e. the name starts with "/tmp").
	if ( logPath.compare( 0, 4, "/tmp" ) == 0 ) {
		debug_printf( DEBUG_QUIET,
					"Warning: Log file %s for node %s is in /tmp\n",
					logPath.c_str(), GetJobName() );
		check_warning_strictness( usingDefault ? DAG_STRICT_2 : DAG_STRICT_1 );
	}

		// Step 3: decide between the user's log and the default log.
	if ( logPath.empty() ) {
		logPath = defaultNodeLog;
		_useDefaultLog = true;
			// Default User log is never XML
			// This could be specified in the submit file and should be
			// ignored.
		_logFileIsXml = false;
		debug_printf( DEBUG_NORMAL,
					"Unable to get log file from submit file %s (node %s); using default (%s)\n",
					_cmdFile, GetJobName(), logPath.c_str() );
		append_default_log = false;
	} else {
		append_default_log = usingDefault;
		if( append_default_log ) {
				// DAGman is not going to look at the user-specified log.
				// It will look at the defaultNode log.
			logPath = defaultNodeLog;
			_useDefaultLog = false;
			_logFileIsXml = false;
		}
	}

		// Step 4: NFS check.
		// This function returns true if the log file is on NFS and
		// that is an error.  If the log file is on NFS, but nfsIsError
		// is false, it prints a warning but returns false.
	const bool nfsFailure =
				MultiLogFiles::logFileNFSError( logPath.c_str(), nfsIsError );
	if ( nfsFailure ) {
		debug_printf( DEBUG_QUIET, "Error: log file %s on NFS\n",
					logPath.c_str() );
		LogMonitorFailed();
		return false;
	}

		// Step 5: remember the file name and start monitoring.
	delete [] _logFile;
		// Saving log file here in case submit file gets changed.
	_logFile = strnewp( logPath.c_str() );
	debug_printf( DEBUG_DEBUG_2, "Monitoring log file <%s> for node %s\n",
				GetLogFile(), GetJobName() );

	ReadMultipleUserLogs &reader = (_jobType == TYPE_CONDOR) ?
				condorLogReader : storkLogReader;
	CondorError errstack;
	const bool monitoring =
				reader.monitorLogFile( GetLogFile(), !recovery, errstack );
	if ( !monitoring ) {
		errstack.pushf( "DAGMan::Job", DAGMAN_ERR_LOG_FILE,
					"ERROR: Unable to monitor log file for node %s",
					GetJobName() );
		debug_printf( DEBUG_QUIET, "%s\n", errstack.getFullText().c_str() );
		LogMonitorFailed();
		EXCEPT( "Fatal log file monitoring error!\n" );
		return false;
	}

	_logIsMonitored = true;

	return true;
}
Example #4
0
// Handler for data arriving on stdin: reads every complete line currently
// available from stdin_buffer, logs it, parses it as a GAHP command and
// executes it.
//
// Commands answered inline: RESULTS, VERSION, QUIT, ASYNC_MODE_ON,
// ASYNC_MODE_OFF, COMMANDS, CREATE_CONDOR_SECURITY_SESSION and
// CONDOR_VERSION.  The sandbox commands (DOWNLOAD / UPLOAD / DESTROY) are
// handed to a worker created with daemonCore->Create_Thread(); the worker
// is recorded in sandbox_map together with the read end of an error pipe.
//
// Lines that fail parse_gahp_command() or verify_gahp_command() are
// answered with gahp_output_return_error().
//
// Returns TRUE once no further line is available.  Calls DC_Exit(0) on
// QUIT and DC_Exit(1) if stdin_buffer reports an error or EOF.
int
stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		// CREATE_CONDOR_SECURITY_SESSION contains sensitive data that
		// normally shouldn't be written to a publicly-readable log.
		// We should conceal it unless GAHP_DEBUG_HIDE_SENSITIVE_DATA
		// says not to.
		//
		// The comparison must ignore case: command dispatch below uses
		// strcasecmp(), so a lower- or mixed-case spelling of the command
		// is accepted too and must be concealed just the same.
		if ( param_boolean( "GAHP_DEBUG_HIDE_SENSITIVE_DATA", true ) &&
			 strncasecmp( command, GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					  strlen( GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION ) ) == 0 ) {
			dprintf( D_ALWAYS, "got stdin: %s XXXXXXXX\n",
					 GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION );
		} else {
			dprintf (D_ALWAYS, "got stdin: %s\n", command);
		}

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special" commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {
					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line, removing it from the list
					// once it has been written.
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
				gahp_output_return_success();
				DC_Exit(0);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION,
					GAHP_COMMAND_CONDOR_VERSION,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
					// Derive the count from the array so the two cannot
					// drift apart when a command is added.
				gahp_output_return (commands,
					static_cast<int>( sizeof(commands) / sizeof(commands[0]) ));
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CREATE_CONDOR_SECURITY_SESSION) == 0) {
				ClaimIdParser claimid( args.argv[1] );
				if ( !daemonCore->getSecMan()->CreateNonNegotiatedSecuritySession(
										DAEMON,
										claimid.secSessionId(),
										claimid.secSessionKey(),
										claimid.secSessionInfo(),
										CONDOR_PARENT_FQU,
										NULL,
										0 ) ) {
					gahp_output_return_error();
				} else {
					sec_session_id = claimid.secSessionId();
					gahp_output_return_success();
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_CONDOR_VERSION) == 0) {
				peer_condor_version = args.argv[1];

				const char *reply [] = { GAHP_RESULT_SUCCESS,
										 escapeGahpString( CondorVersion() ) };
				gahp_output_return( reply, 2 );

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

					// fds[1] is published through ChildErrorPipe before the
					// worker is created and closed here right afterwards;
					// fds[0] is kept in the sandbox_map entry.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					// NOTE(review): this result carries two fields while the
					// upload/destroy failure results carry one -- presumably
					// the download result format has an extra field; confirm.
					const char * res[2] = {
						"Worker thread failed",
						"NULL"
					};
					enqueue_result(args.argv[1], res, 2);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

					// Same pipe handling as for the download command.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

					// Same pipe handling as for the download command.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					const char * res[1] = {
						"Worker thread failed"
					};
					enqueue_result(args.argv[1], res, 1);
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}
			
		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
Example #5
0
// Handler for data arriving on stdin: reads every complete line currently
// available from stdin_buffer, logs it, parses it as a GAHP command and
// executes it.
//
// Commands answered inline: RESULTS, VERSION, QUIT, ASYNC_MODE_ON,
// ASYNC_MODE_OFF and COMMANDS.  The sandbox commands (DOWNLOAD / UPLOAD /
// DESTROY) are handed to a worker created with
// daemonCore->Create_Thread(); the worker is recorded in sandbox_map
// together with the read end of an error pipe.
//
// Lines that fail parse_gahp_command() or verify_gahp_command() are
// answered with gahp_output_return_error().
//
// Returns TRUE once no further line is available.  Calls DC_Exit(0) on
// QUIT and DC_Exit(1) if stdin_buffer reports an error or EOF.
int
stdin_pipe_handler(Service*, int) {

	std::string* line;
	while ((line = stdin_buffer.GetNextLine()) != NULL) {

		const char * command = line->c_str();

		dprintf (D_ALWAYS, "got stdin: %s\n", command);

		Gahp_Args args;

		if (parse_gahp_command (command, &args) &&
			verify_gahp_command (args.argv, args.argc)) {

				// Catch "special" commands first
			if (strcasecmp (args.argv[0], GAHP_COMMAND_RESULTS) == 0) {
					// Print number of results
				std::string rn_buff;
				formatstr( rn_buff, "%d", result_list.number() );
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					rn_buff.c_str() };
				gahp_output_return (commands, 2);

					// Print each result line, removing it from the list
					// once it has been written.
				char * next;
				result_list.rewind();
				while ((next = result_list.next()) != NULL) {
					printf ("%s\n", next);
					fflush(stdout);
					dprintf(D_FULLDEBUG,"put stdout: %s\n",next);
					result_list.deleteCurrent();
				}

				new_results_signaled = FALSE;
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_VERSION) == 0) {
				printf ("S %s\n", version);
				fflush (stdout);
				dprintf(D_FULLDEBUG,"put stdout: S %s\n",version);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_QUIT) == 0) {
				gahp_output_return_success();
				DC_Exit(0);
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_ON) == 0) {
				async_mode = TRUE;
				new_results_signaled = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_ASYNC_MODE_OFF) == 0) {
				async_mode = FALSE;
				gahp_output_return_success();
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_COMMANDS) == 0) {
				const char * commands [] = {
					GAHP_RESULT_SUCCESS,
					GAHP_COMMAND_DOWNLOAD_SANDBOX,
					GAHP_COMMAND_UPLOAD_SANDBOX,
					GAHP_COMMAND_DESTROY_SANDBOX,
					GAHP_COMMAND_ASYNC_MODE_ON,
					GAHP_COMMAND_ASYNC_MODE_OFF,
					GAHP_COMMAND_RESULTS,
					GAHP_COMMAND_QUIT,
					GAHP_COMMAND_VERSION,
					GAHP_COMMAND_COMMANDS};
					// Derive the count from the array so the two cannot
					// drift apart when a command is added.
				gahp_output_return (commands,
					static_cast<int>( sizeof(commands) / sizeof(commands[0]) ));
			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DOWNLOAD_SANDBOX) == 0) {

					// fds[1] is published through ChildErrorPipe before the
					// worker is created and closed here right afterwards;
					// fds[0] is kept in the sandbox_map entry.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!\n" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_download_sandbox, (void*)strdup(command), NULL, download_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created download_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					// NOTE(review): success is reported but no result is
					// queued for this request here -- confirm the client is
					// not left waiting for one.
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_UPLOAD_SANDBOX) == 0) {

					// Same pipe handling as for the download command.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!\n" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_upload_sandbox, (void*)strdup(command), NULL, upload_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created upload_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					// NOTE(review): as above, no result is queued here.
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else if (strcasecmp (args.argv[0], GAHP_COMMAND_DESTROY_SANDBOX) == 0) {

					// Same pipe handling as for the download command.
				int fds[2];
				if ( pipe( fds ) < 0 ) {
					EXCEPT( "Failed to create pipe!\n" );
				}
				ChildErrorPipe = fds[1];
				int tid = daemonCore->Create_Thread(do_command_destroy_sandbox, (void*)strdup(command), NULL, destroy_sandbox_reaper_id);

				close( fds[1] );
				if( tid ) {
					dprintf (D_ALWAYS, "BOSCO: created destroy_sandbox thread, id: %i\n", tid);

					// this is a "success" in the sense that the gahp command was
					// well-formatted.  whether or not the file transfer works or
					// not is not what we are reporting here.
					gahp_output_return_success();

					SandboxEnt e;
					e.pid = tid;
					e.request_id = args.argv[1];
					e.sandbox_id = args.argv[2];
					e.error_pipe = fds[0];
					// transfer started, record the entry in the map
					std::pair<int, struct SandboxEnt> p(tid, e);
					sandbox_map.insert(p);
				} else {
					// NOTE(review): as above, no result is queued here.
					dprintf (D_ALWAYS, "BOSCO: Create_Thread FAILED!\n");
					gahp_output_return_success();
					close( fds[0] );
				}

			} else {
				// should never get here if verify does its job
				dprintf(D_ALWAYS, "FTGAHP: got bad command: %s\n", args.argv[0]);
				gahp_output_return_error();
			}
			
		} else {
			gahp_output_return_error();
		}

		delete line;
	}

	// check if GetNextLine() returned NULL because of an error or EOF
	if (stdin_buffer.IsError() || stdin_buffer.IsEOF()) {
		dprintf (D_ALWAYS, "stdin buffer closed, exiting\n");
		DC_Exit (1);
	}

	return TRUE;
}
// clears all the inner structures and loads the configuration parameters'
// values again
void
ReplicatorStateMachine::reinitialize()
{
    // delete all configurations and start everything over from the scratch
    finalize( );
    AbstractReplicatorStateMachine::reinitialize( );

    m_myVersion.initialize( m_stateFilePath, m_versionFilePath );

    m_replicationInterval =
		param_integer("REPLICATION_INTERVAL",
					  5 * MINUTE,
					  0); // min value, must be positive
    // deduce HAD alive tolerance
    int hadConnectionTimeout =
		param_integer("HAD_CONNECTION_TIMEOUT",
					  DEFAULT_SEND_COMMAND_TIMEOUT,
					  0); // min value, must be positive
    m_maxTransfererLifeTime =
		param_integer("MAX_TRANSFER_LIFETIME",
					  5 * MINUTE,
					  0); // min value, must be positive
    m_newlyJoinedWaitingVersionInterval =
		param_integer("NEWLY_JOINED_WAITING_VERSION_INTERVAL",
					  NEWLY_JOINED_TOLERANCE_FACTOR * (hadConnectionTimeout + 1),
					  0); // min value, must be positive

    char* buffer = param( "HAD_LIST" );

    if ( buffer ) {
        StringList hadList;

        hadList.initializeFromString( buffer );
        free( buffer );
        m_hadAliveTolerance = HAD_ALIVE_TOLERANCE_FACTOR *
                            ( 2 * hadConnectionTimeout * hadList.number() + 1 );

        dprintf( D_FULLDEBUG, "ReplicatorStateMachine::reinitialize %s=%d\n",
                "HAD_LIST", m_hadAliveTolerance );
    } else {
        utilCrucialError( utilNoParameterError( "HAD_LIST", "HAD" ).Value( ));
    }

    initializeClassAd();
    int updateInterval = param_integer ( "REPLICATION_UPDATE_INTERVAL", 300 );
    if ( m_updateInterval != updateInterval ) {
        m_updateInterval = updateInterval;

        utilCancelTimer(m_updateCollectorTimerId);

        m_updateCollectorTimerId = daemonCore->Register_Timer ( 0,
               m_updateInterval,
               (TimerHandlercpp) &ReplicatorStateMachine::updateCollectors,
               "ReplicatorStateMachine::updateCollectors", this );
    }

    // set a timer to replication routine
    dprintf( D_ALWAYS, "ReplicatorStateMachine::reinitialize setting "
                                      "replication timer\n" );
    m_replicationTimerId = daemonCore->Register_Timer( m_replicationInterval,
            (TimerHandlercpp) &ReplicatorStateMachine::replicationTimer,
            "Time to replicate file", this );
    // register the download/upload reaper for the transferer process
    if( m_downloadReaperId == -1 ) {
		m_downloadReaperId = daemonCore->Register_Reaper(
        	"downloadReplicaTransfererReaper",
        (ReaperHandler)&ReplicatorStateMachine::downloadReplicaTransfererReaper,
        	"downloadReplicaTransfererReaper", this );
	}
    if( m_uploadReaperId == -1 ) {
		m_uploadReaperId = daemonCore->Register_Reaper(
        	"uploadReplicaTransfererReaper",
        (ReaperHandler) &ReplicatorStateMachine::uploadReplicaTransfererReaper,
        	"uploadReplicaTransfererReaper", this );
    }
	// for debugging purposes only
	printDataMembers( );
	
	beforePassiveStateHandler( );
}
Example #7
0
// Runs one step of the batched status query for this resource and
// dispatches the results to the jobs they belong to.
//
// The query has two phases, selected by m_checkSpotNext:
//   1. Instance status (ec2_vm_status_all).  Each six-field record is
//      matched to a job by instance ID, then by client token, then -- if
//      ClientTokenWorks() is false -- by ssh keypair name.  Registered jobs
//      with a client token that received no record are told so via
//      StatusUpdate( NULL, NULL, NULL, NULL ).
//   2. Spot request status (ec2_spot_status_all), only entered when
//      spotJobsByRequestID is not empty.  Each five-field record is matched
//      by request ID; spot jobs without a record are told so the same way.
//
// Returns BSR_PENDING while a GAHP command is still pending, BSR_ERROR if
// a GAHP command failed, and BSR_DONE once all phases that apply have
// completed.
EC2Resource::BatchStatusResult EC2Resource::StartBatchStatus() {
    ASSERT( status_gahp );

    // m_checkSpotNext starts out false
    if( ! m_checkSpotNext ) {
        StringList returnStatus;
        std::string errorCode;
        int rc = status_gahp->ec2_vm_status_all( resourceName,
                    m_public_key_file, m_private_key_file,
                    returnStatus, errorCode );

        if( rc == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }

        if( rc != 0 ) {
            std::string errorString = status_gahp->getErrorString();
            dprintf( D_ALWAYS, "Error doing batched EC2 status query: %s: %s.\n",
                     errorCode.c_str(), errorString.c_str() );
            return BSR_ERROR;
        }

        //
        // We have to let a job know if we can't find a status report for it.
        // myJobs starts as every registered job that has a client token;
        // jobs are removed from it as their status records are found.
        //
        List<EC2Job> myJobs;
        EC2Job * nextJob = NULL;
        BaseJob *nextBaseJob = NULL;
        registeredJobs.Rewind();
        while ( (nextBaseJob = registeredJobs.Next()) ) {
            nextJob = dynamic_cast< EC2Job * >( nextBaseJob );
            ASSERT( nextJob );
            if ( !nextJob->m_client_token.empty() ) {
                myJobs.Append( nextJob );
            }
        }

        // Each status record consists of six consecutive strings.
        returnStatus.rewind();
        ASSERT( returnStatus.number() % 6 == 0 );
        for( int i = 0; i < returnStatus.number(); i += 6 ) {
            std::string instanceID = returnStatus.next();
            std::string status = returnStatus.next();
            std::string clientToken = returnStatus.next();
            std::string keyName = returnStatus.next();
            std::string stateReasonCode = returnStatus.next();
            std::string publicDNSName = returnStatus.next();

            // Efficiency suggests we look via the instance ID first,
            // and then try to look things up via the client token
            // (or, for GT #3682, via the keypair ID).

            // We can't use BaseJob::JobsByRemoteId because OpenStack doesn't
            // include the client token in its status responses, and therefore
            // we can't always fully reconstruct the remoteJobID used as the key.
            EC2Job * job = NULL;
            rc = jobsByInstanceID.lookup( HashKey( instanceID.c_str() ), job );
            if( rc == 0 ) {
                ASSERT( job );

                dprintf( D_FULLDEBUG, "Found job object for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                   stateReasonCode.c_str(), publicDNSName.c_str() );
                myJobs.Delete( job );
                continue;
            }

            // If we got a client token, use that to look up the job.  We
            // don't use the instance ID because we may discover it in
            // this function.  Since we need instance ID -based dispatch
            // code for OpenStack anyway, we'll just use it, rather than
            // trying the remoteJobID with the instance ID if we don't
            // find it using only the client token.
            if( ! clientToken.empty() && clientToken != "NULL" ) {
                std::string remoteJobID;
                formatstr( remoteJobID, "ec2 %s %s", resourceName, clientToken.c_str() );

                BaseJob * tmp = NULL;
                rc = BaseJob::JobsByRemoteId.lookup( HashKey( remoteJobID.c_str() ), tmp );

                if( rc == 0 ) {
                    ASSERT( tmp );
                    job = dynamic_cast< EC2Job * >( tmp );
                    if( job == NULL ) {
                        EXCEPT( "Found non-EC2Job identified by '%s'.", remoteJobID.c_str() );
                    }

                    dprintf( D_FULLDEBUG, "Found job object via client token for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                    job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                       stateReasonCode.c_str(), publicDNSName.c_str() );
                    myJobs.Delete( job );
                    continue;
                }
            }

            // Some servers (OpenStack, Eucalyptus) silently ignore client
            // tokens. So we need to use the ssh keypair to find jobs that
            // were submitted but which we don't have an instance ID for.
            //
            // TODO This code should be made more efficient. We can
            //   do something better than a linear scan through all
            //   jobs for each status result. Ideally, we'd parse the
            //   ssh keypair name and if it looks like one we generated,
            //   pluck out the job id.
            //
            // The scan visits every remaining job, so all jobs sharing
            // the keypair name are updated.  matchedByKeyPair records
            // whether any of them matched, so that a matched instance is
            // not reported as unknown below.
            bool matchedByKeyPair = false;
            if ( !ClientTokenWorks() && !keyName.empty() && keyName != "NULL" ) {
                myJobs.Rewind();
                while ( ( job = myJobs.Next() ) ) {
                    if ( job->m_key_pair != keyName ) {
                        continue;
                    }
                    dprintf( D_FULLDEBUG, "Found job object via ssh keypair for '%s', updating status ('%s').\n", instanceID.c_str(), status.c_str() );
                    job->StatusUpdate( instanceID.c_str(), status.c_str(),
                                       stateReasonCode.c_str(),
                                       publicDNSName.c_str() );
                    myJobs.Delete( job );
                    matchedByKeyPair = true;
                }
            }
            if ( matchedByKeyPair ) {
                continue;
            }

            dprintf( D_FULLDEBUG, "Found unknown instance '%s'; skipping.\n", instanceID.c_str() );
        }

        // Whatever is still in myJobs received no status record.
        myJobs.Rewind();
        while( ( nextJob = myJobs.Next() ) ) {
            dprintf( D_FULLDEBUG, "Informing job %p it got no status.\n", nextJob );
            nextJob->StatusUpdate( NULL, NULL, NULL, NULL );
        }

        // Don't ask for spot results unless we know about a spot job.  This
        // should prevent us from breaking OpenStack.
        if( spotJobsByRequestID.getNumElements() == 0 ) {
            m_checkSpotNext = false;
            return BSR_DONE;
        } else {
            m_checkSpotNext = true;
        }
    }

    if( m_checkSpotNext ) {
        StringList spotReturnStatus;
        std::string spotErrorCode;
        int spotRC = status_gahp->ec2_spot_status_all( resourceName,
                        m_public_key_file, m_private_key_file,
                        spotReturnStatus, spotErrorCode );

        if( spotRC == GAHPCLIENT_COMMAND_PENDING ) { return BSR_PENDING; }

        if( spotRC != 0 ) {
            std::string errorString = status_gahp->getErrorString();
            dprintf( D_ALWAYS, "Error doing batched EC2 spot status query: %s: %s.\n",
                     spotErrorCode.c_str(), errorString.c_str() );
            return BSR_ERROR;
        }

        // mySpotJobs starts as every known spot job; jobs are removed from
        // it as their status records are found.
        List<EC2Job> mySpotJobs;
        EC2Job * nextSpotJob = NULL;
        spotJobsByRequestID.startIterations();
        while( spotJobsByRequestID.iterate( nextSpotJob ) ) {
            mySpotJobs.Append( nextSpotJob );
        }

        // Each spot record consists of five consecutive strings; the
        // third and fourth (launch group, instance ID) are read only to
        // advance past them.
        spotReturnStatus.rewind();
        ASSERT( spotReturnStatus.number() % 5 == 0 );
        for( int i = 0; i < spotReturnStatus.number(); i += 5 ) {
            std::string requestID = spotReturnStatus.next();
            std::string state = spotReturnStatus.next();
            /* std::string launchGroup = */ spotReturnStatus.next();
            /* std::string instanceID = */ spotReturnStatus.next();
            std::string statusCode = spotReturnStatus.next();

            EC2Job * spotJob = NULL;
            spotRC = spotJobsByRequestID.lookup( HashKey( requestID.c_str() ), spotJob );
            if( spotRC != 0 ) {
                dprintf( D_FULLDEBUG, "Found unknown spot request '%s'; skipping.\n", requestID.c_str() );
                continue;
            }
            ASSERT( spotJob );

            // A non-empty status code takes precedence over the state.
            if( ! statusCode.empty() ) { state = statusCode; }

            dprintf( D_FULLDEBUG, "Found spot job object for '%s', updating status ('%s').\n", requestID.c_str(), state.c_str() );
            spotJob->StatusUpdate( NULL, state.c_str(), NULL, NULL );
            mySpotJobs.Delete( spotJob );
        }

        // Whatever is still in mySpotJobs received no status record.
        mySpotJobs.Rewind();
        while( ( nextSpotJob = mySpotJobs.Next() ) ) {
            dprintf( D_FULLDEBUG, "Informing spot job %p it got no status.\n", nextSpotJob );
            nextSpotJob->StatusUpdate( NULL, NULL, NULL, NULL );
        }

        m_checkSpotNext = false;
        return BSR_DONE;
    }

    // This should never happen (but the compiler hates you).
    return BSR_ERROR;
}