Пример #1
0
/******************************************************************************************
* @brief	sendResourceAlarm
*
* purpose:	send a trap and log the process information
*
* Runs the matching check*Alarm() routine for the resource, then sends the alarm
* unless it is already active, and writes an info log entry when it is sent.
*
* @param alarmItem	resource being reported: "CPU", "Local Disk", "External",
*					"Local Memory", "Local Swap", or a name that starts with
*					<installDir>/data
* @param alarmID	alarm to send
* @param action		action value handed to SNMPManager::sendAlarmReport()
* @param usage		usage percentage written into the log message
*
* @return	true when the alarm was sent and logged,
*			false when the alarm was already active and nothing was sent
*
******************************************************************************************/
bool ServerMonitor::sendResourceAlarm(string alarmItem, ALARMS alarmID, int action, int usage)
{
	// NOTE(review): the check*Alarm() calls below go through a separate
	// ServerMonitor instance rather than *this - confirm that is intended.
	ServerMonitor serverMonitor;
	Oam oam;

	// Start the log entry; the alarm ID is appended only when an alarm is sent
	LoggingID lid(SERVER_MONITOR_LOG_ID);
	MessageLog ml(lid);
	Message msg;
	Message::Args args;
	args.add(alarmItem);
	args.add(" usage at percentage of ");
	args.add(usage);

	// get current module name, falling back to a placeholder on any failure
	string moduleName;
	try {
		oamModuleInfo_t moduleInfo = oam.getModuleInfo();
		moduleName = boost::get<0>(moduleInfo);
	}
	catch (...) {
		moduleName = "Unknown Server";
	}

	// check if there is an active alarm above the reporting threshold
	// that needs to be cleared
	if (alarmItem == "CPU")
		serverMonitor.checkCPUAlarm(alarmItem, alarmID);
	else if (alarmItem == "Local Disk" || alarmItem == "External")
		serverMonitor.checkDiskAlarm(alarmItem, alarmID);
	else if (alarmItem == "Local Memory")
		serverMonitor.checkMemoryAlarm(alarmItem, alarmID);
	else if (alarmItem == "Local Swap")
		serverMonitor.checkSwapAlarm(alarmItem, alarmID);

	// Items under <installDir>/data (the dbroots) must not be alarmed again if
	// this or any other server already issued the alarm, so the active-alarm
	// lookup uses every module ("*"). Everything else is checked against this
	// module only.
	const bool isDbrootItem =
		( alarmItem.find(startup::StartUp::installDir() + "/data") == 0 );
	const string alarmScope = isDbrootItem ? string("*") : moduleName;

	// alarm is already active in the selected scope, don't resend
	if ( oam.checkActiveAlarm(alarmID, alarmScope, alarmItem) )
		return false;

	// send alarm
	SNMPManager alarmMgr;
	alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);

	// complete and write the log entry
	args.add(", Alarm set: ");
	args.add(alarmID);
	msg.format(args);
	ml.logInfoMessage(msg);
	return true;
}
Пример #2
0
/*****************************************************************************************
* @brief	cpuMonitor Thread
*
* purpose:	Get current CPU usage, average over 5 readings and report alarms
*
*****************************************************************************************/
void cpuMonitor()
{
	ServerMonitor serverMonitor;

	// register for Heartbeat monitoring
/*	try {
		ProcHeartbeat procheartbeat;
		procheartbeat.registerHeartbeat(CPU_HEARTBEAT_ID);
	}
	catch (exception& ex)
	{
		string error = ex.what();
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on registerHeartbeat: ");
		args.add(error);
		msg.format(args);
		ml.logErrorMessage(msg);
	}
	catch(...)
	{
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
		msg.format(args);
		ml.logErrorMessage(msg);
	}
*/
	int periodCount = 5;
	float cpuPeriod[periodCount];
	int periodCounter = 0;
	float averageCpuUsage = 0;
	currentCpuUsage = 0;

	// set defaults
	unsigned int cpuCritical = 0, 
				 cpuMajor = 0, 
				 cpuMinor = 0,
				 cpuMinorClear = 0;

	// initial cpu Period table
	for (int i =0;i < periodCount; i++)
	{
		cpuPeriod[i] = 0;
	}

	while(true)
	{
		// Get CPU usage water mark from server configuration and compare
		ModuleTypeConfig moduleTypeConfig;
		Oam oam;
		try {
			oam.getSystemConfig(moduleTypeConfig);
			cpuCritical = moduleTypeConfig.ModuleCPUCriticalThreshold; 
			cpuMajor = moduleTypeConfig.ModuleCPUMajorThreshold; 
			cpuMinor = moduleTypeConfig.ModuleCPUMinorThreshold;
			cpuMinorClear = moduleTypeConfig.ModuleCPUMinorClearThreshold;
		} catch (...)
		{
			sleep(5);
			continue;
		}

		if (RESOURCE_DEBUG)
			cout << "critical water: " << moduleTypeConfig.ModuleCPUCriticalThreshold << endl;

		pthread_mutex_lock(&CPU_LOCK);
		//
		// get Process and System CPU usage
		//
		serverMonitor.getCPUdata();

		// store and get average
		cpuPeriod[periodCounter] = currentCpuUsage;
		averageCpuUsage = 0;
		for (int i =0;i < periodCount; i++)
		{
			averageCpuUsage += cpuPeriod[i];
		}
		averageCpuUsage = averageCpuUsage / periodCount;

//		serverMonitor.logCPUactive(averageCpuUsage);
		if (CPU_DEBUG) {
			cout << "Current CPU Usage: " << currentCpuUsage << endl;
			cout << "Average CPU Usage: " << averageCpuUsage << endl;
		}

		if (averageCpuUsage >= cpuCritical && cpuCritical > 0 ) {
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_HIGH, SET, (int) averageCpuUsage);
		}
		else if (averageCpuUsage >= cpuMajor && cpuMajor > 0 )
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_MED, SET, (int) averageCpuUsage);
		else if (averageCpuUsage >= cpuMinor && cpuMinor > 0 )
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_LOW, SET, (int) averageCpuUsage);
		else if (averageCpuUsage >= cpuMinorClear && cpuMinorClear > 0 ) {
			serverMonitor.checkCPUAlarm("CPU", CPU_USAGE_LOW);
			//Log this event 
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Current CPU usage = ");
			args.add((int) currentCpuUsage);
			args.add(", Average CPU usage = ");
			args.add((int) averageCpuUsage);
			msg.format(args);
			ml.logInfoMessage(msg);
		}
		else
			serverMonitor.checkCPUAlarm("CPU");

		//
		// check CPU usage by process
		//
		ProcessCPUList::iterator p = pcl.begin();
		while(p != pcl.end())
		{
			string processName =  (*p).processName;
			double cpuUsage =  (*p).usedPercent;
			p++;

			if (CPU_DEBUG) {
				cout << "Process Name : " << processName << endl;
				cout << "CPU Usage: " << cpuUsage << endl;
			}

			// check if a Calpont Process, if so alarm is over thresholds
			// if not, just log if over thresholds
			if (cpuUsage >= cpuCritical && cpuCritical > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_HIGH, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Critical CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
			else if (cpuUsage >= cpuMajor && cpuMajor > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_MED, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Major CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
			else if (cpuUsage >= cpuMinor && cpuMinor > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_LOW, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Minor CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
/*			else if (cpuUsage >= cpuMinorClear) {
				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.checkCPUAlarm(processName, CPU_USAGE_LOW);
				}
				catch (...) {}
			}
			else
				serverMonitor.checkCPUAlarm(processName);
*/		}

		// send heartbeat message
/*		try {
			ProcHeartbeat procheartbeat;
			procheartbeat.sendHeartbeat(CPU_HEARTBEAT_ID);

			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Sent Heartbeat Msg");
			msg.format(args);
			ml.logInfoMessage(msg);
		}
		catch (exception& ex)
		{
			string error = ex.what();
			if ( error.find("Disabled") == string::npos ) {
				LoggingID lid(SERVER_MONITOR_LOG_ID);
				MessageLog ml(lid);
				Message msg;
				Message::Args args;
				args.add("EXCEPTION ERROR on sendHeartbeat: ");
				args.add(error);
				msg.format(args);
				ml.logErrorMessage(msg);
			}
		}
		catch(...)
		{
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
			msg.format(args);
			ml.logErrorMessage(msg);
		}
*/

		pthread_mutex_unlock(&CPU_LOCK);

		// sleep, 5 minutes
		sleep(MONITOR_PERIOD*5);

		++periodCounter;
		if ( periodCounter >= periodCount )
			periodCounter = 0;

	} // end of while loop
}