Beispiel #1
0
/******************************************************************************************
* @brief	sendAlarm
*
* purpose:	Report an alarm for an out-of-range sensor (unless the same alarm is
*			already active for this module and item) and log a warning entry.
*
* @param alarmItem		item the alarm is reported against
* @param alarmID		alarm to report
* @param action			action value handed to the alarm report
* @param sensorValue	sensor reading written to the log entry
*
******************************************************************************************/
void ServerMonitor::sendAlarm(string alarmItem, ALARMS alarmID, int action, float sensorValue)
{
	ServerMonitor monitor;
	Oam oam;

	// prepare the log entry; the "Alarm set" suffix is appended further down
	LoggingID logId(SERVER_MONITOR_LOG_ID);
	MessageLog logger(logId);
	Message logMsg;
	Message::Args logArgs;
	logArgs.add(alarmItem);
	logArgs.add(", sensor value out-of-range: ");
	logArgs.add(sensorValue);

	// look up the name of the module we are running on
	string localModule;
	try {
		oamModuleInfo_t moduleInfo;
		moduleInfo = oam.getModuleInfo();
		localModule = boost::get<0>(moduleInfo);
	}
	catch (...) {
		localModule = "Unknown Server";
	}

	// check if there is an active alarm above the reporting threshold
	// that needs to be cleared
	monitor.checkAlarm(alarmItem, alarmID);

	// only report the alarm when it is not active yet
	const bool alreadyActive = oam.checkActiveAlarm(alarmID, localModule, alarmItem);
	if ( !alreadyActive ) {
		SNMPManager alarmMgr;
		alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);

		logArgs.add(", Alarm set: ");
		logArgs.add(alarmID);
	}

	// the warning is written whether or not a new alarm was reported
	logMsg.format(logArgs);
	logger.logWarningMessage(logMsg);
}
// Entry point of the ServerMonitor Windows service.
// "-debug" or "/debug" as first argument runs the monitor through debugservice();
// otherwise the service "ESServerMonitor" is started.
int main(int argc, char* argv[])
{
	::CoInitializeEx(NULL, COINIT_MULTITHREADED);

	// the configuration files live next to the executable
	g_appPath = Linkwork::Win32::GetAppPath();
	g_strconfigfile = g_appPath + "ServerMonitor.ini";
	g_strserverini = g_appPath + "Server.ini";

	SetCurrentDirectory(g_appPath.c_str());
	gLogger.init(g_strconfigfile.c_str(), "ServerMonitor");

#ifndef _DEBUG
	SetAutoMinidump();
#endif

	ServerMonitor sm;

	// a command line switch starts with '-' or '/'
	const bool hasSwitch = (argc > 1) && ((argv[1][0] == '-') || (argv[1][0] == '/'));
	if (hasSwitch && _stricmp("debug", argv[1] + 1) == 0)
	{
		// run in debug mode
		sm.debugservice();
		return TRUE;
	}

	// the logged text means "service failed to start"
	if (!sm.startservice("ESServerMonitor"))
		gLogger.debug("服务启动失败");

	return 0;
}
Beispiel #3
0
/******************************************************************************************
* @brief	sendResourceAlarm
*
* purpose:	Report a resource usage alarm and log it, unless the alarm is already active
*
* @param alarmItem	resource the alarm is reported against
* @param alarmID	alarm to report
* @param action		action value handed to the alarm report
* @param usage		usage percentage written to the log entry
*
* @return	true when a new alarm was reported, false when it was already active
*
******************************************************************************************/
bool ServerMonitor::sendResourceAlarm(string alarmItem, ALARMS alarmID, int action, int usage)
{
	ServerMonitor monitor;
	Oam oam;

	// prepare the log entry; it is only written when an alarm is actually reported
	LoggingID logId(SERVER_MONITOR_LOG_ID);
	MessageLog logger(logId);
	Message logMsg;
	Message::Args logArgs;
	logArgs.add(alarmItem);
	logArgs.add(" usage at percentage of ");
	logArgs.add(usage);

	// look up the name of the module we are running on
	string localModule;
	try {
		oamModuleInfo_t moduleInfo;
		moduleInfo = oam.getModuleInfo();
		localModule = boost::get<0>(moduleInfo);
	}
	catch (...) {
		localModule = "Unknown Server";
	}

	// check if there is an active alarm above the reporting threshold
	// that needs to be cleared
	if (alarmItem == "CPU") {
		monitor.checkCPUAlarm(alarmItem, alarmID);
	}
	else if (alarmItem == "Local Disk" || alarmItem == "External") {
		monitor.checkDiskAlarm(alarmItem, alarmID);
	}
	else if (alarmItem == "Local Memory") {
		monitor.checkMemoryAlarm(alarmItem, alarmID);
	}
	else if (alarmItem == "Local Swap") {
		monitor.checkSwapAlarm(alarmItem, alarmID);
	}

	// An item located under <installDir>/data must not be alarmed again when
	// any module already reported it; every other item is only checked
	// against alarms raised by this module.
	const bool underDataDir = ( alarmItem.find(startup::StartUp::installDir() + "/data") == 0 );
	const string moduleFilter = underDataDir ? string("*") : localModule;

	// alarm already active, don't resend
	if ( oam.checkActiveAlarm(alarmID, moduleFilter, alarmItem) )
		return false;

	SNMPManager alarmMgr;
	alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);

	logArgs.add(", Alarm set: ");
	logArgs.add(alarmID);
	logMsg.format(logArgs);
	logger.logInfoMessage(logMsg);
	return true;
}
Beispiel #4
0
/*****************************************************************************************
* @brief	diskMonitor Thread
*
* purpose:	Get current Local and External disk usage and report alarms
*
*****************************************************************************************/
void diskMonitor()
{
	ServerMonitor serverMonitor;
	Oam oam;
    SystemConfig systemConfig;
	ModuleTypeConfig moduleTypeConfig;
	typedef std::vector<std::string> LocalFileSystems;
	LocalFileSystems lfs;
	struct statvfs buf; 

	// set defaults
	int localDiskCritical = 90,
		localDiskMajor = 80,
		localDiskMinor = 70,
		ExternalDiskCritical = 90,
		ExternalDiskMajor = 80,
		ExternalDiskMinor = 70;

	// get module types
	string moduleType;
	int moduleID=-1;
	string moduleName;
	oamModuleInfo_t t;
	try {
		t = oam.getModuleInfo();
		moduleType = boost::get<1>(t);
		moduleID = boost::get<2>(t);
		moduleName = boost::get<0>(t);
	}
	catch (exception& e) {}

	bool Externalflag = false;

	//check for external disk
	DBrootList dbrootList;
	if (moduleType == "pm") {
		systemStorageInfo_t t;
		t = oam.getStorageConfig();
		if ( boost::get<0>(t) == "external")
			Externalflag = true;

		// get dbroot list and storage type from config file
		DBRootConfigList dbrootConfigList;
		oam.getPmDbrootConfig(moduleID, dbrootConfigList);
	
		DBRootConfigList::iterator pt = dbrootConfigList.begin();
		for( ; pt != dbrootConfigList.end() ; pt++)
		{
			int dbrootID = *pt;
	
			string dbroot = "DBRoot" + oam.itoa(dbrootID);
	
			string dbootdir;
			try{
				oam.getSystemConfig(dbroot, dbootdir);
			}
			catch(...) {}
	
			if ( dbootdir.empty() || dbootdir == "" )
				continue;
	
			DBrootData dbrootData;
			dbrootData.dbrootDir = dbootdir;
			dbrootData.downFlag = false;
	
			dbrootList.push_back(dbrootData);
		}
	}

	string cloud = oam::UnassignedName;
	try {
		oam.getSystemConfig( "Cloud", cloud);
	}
	catch(...) {
		cloud = oam::UnassignedName;
	}

	//get Gluster Config setting
	string GlusterConfig = "n";
	try {
		oam.getSystemConfig( "GlusterConfig", GlusterConfig);
	}
	catch(...)
	{
		GlusterConfig = "n";
	}

	int diskSpaceCheck = 0;

	while(true)
	{
		SystemStatus systemstatus;
		try {
			oam.getSystemStatus(systemstatus);
		}
		catch (exception& ex)
		{}
		
		if (systemstatus.SystemOpState != oam::ACTIVE ) {
			sleep(5);
			continue;
		}

		// Get Local/External Disk Mount points to monitor and associated thresholds
		
		try {
			oam.getSystemConfig (moduleTypeConfig);
			localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold; 
			localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold; 
			localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold;

			DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin();
			for( ; p != moduleTypeConfig.FileSystems.end() ; p++)
			{
				string fs = *p;
				lfs.push_back(fs);

				if (DISK_DEBUG) {
					//Log this event 
					LoggingID lid(SERVER_MONITOR_LOG_ID);
					MessageLog ml(lid);
					Message msg;
					Message::Args args;
					args.add("Local Config File System to monitor =");
					args.add(fs);
					msg.format(args);
					ml.logDebugMessage(msg);
				}
			}

		} catch (...)
		{
			sleep(5);
			continue;
		}

		// get External info
		try
		{
			oam.getSystemConfig(systemConfig);

		} catch (...)
		{
			sleep(5);
			continue;
		}

		if (Externalflag) {
			// get External info
			try
			{
				ExternalDiskCritical = systemConfig.ExternalCriticalThreshold;
				ExternalDiskMajor = systemConfig.ExternalMajorThreshold;
				ExternalDiskMinor = systemConfig.ExternalMinorThreshold;

			} catch (...)
			{
				sleep(5);
				continue;
			}
		}

		//check for local file systems
		LocalFileSystems::iterator p = lfs.begin();
		while(p != lfs.end())
		{
			string deviceName = *p;
			++p;
			string fileName;
			// check local
			if ( deviceName == "/") {
				fileName = deviceName + "usr/local/Calpont/releasenum";
			}
			else
			{
				fileName = deviceName + "/000.dir";
			}

			uint64_t totalBlocks;
			uint64_t usedBlocks;

			if (!statvfs(fileName.c_str(), &buf)) {

				uint64_t blksize, blocks, freeblks, free; 

				blksize = buf.f_bsize; 
				blocks = buf.f_blocks; 
				freeblks = buf.f_bfree; 

				totalBlocks = blocks * blksize;
				free = freeblks * blksize; 
				usedBlocks = totalBlocks - free; 
			}
			else
				continue;

			int64_t diskUsage = 0;
			if ( totalBlocks == 0 ) {
				diskUsage = 0;
	
				//Log this event 
				LoggingID lid(SERVER_MONITOR_LOG_ID);
				MessageLog ml(lid);
				Message msg;
				Message::Args args;
				args.add("Total Disk Usage is set to 0");
				msg.format(args);
				ml.logWarningMessage(msg);
			}
			else
				diskUsage =  (usedBlocks / (totalBlocks / 100)) + 1;

			SMSystemDisk sd;
			sd.deviceName = deviceName;
			sd.usedPercent = diskUsage;
			sd.totalBlocks = totalBlocks;
			sd.usedBlocks = usedBlocks;
			sdl.push_back(sd);

			if (DISK_DEBUG)
				cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
	
			if ( diskSpaceCheck == 0 )
			{
				if (diskUsage >= localDiskCritical && localDiskCritical > 0 ) {
					//adjust if over 100%
					if ( diskUsage > 100 )
						diskUsage = 100;
					if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) )
					{
						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Local Disk above Critical Disk threshold with a percentage of ");
						args.add((int) diskUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
					}
				}
				else if (diskUsage >= localDiskMajor && localDiskMajor > 0 ) {
					if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage))
					{
						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Local Disk above Major Disk threshold with a percentage of ");
						args.add((int) diskUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
					}
				}
				else if (diskUsage >= localDiskMinor && localDiskMinor > 0 ) {
					if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage))
					{
						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Local Disk above Minor Disk threshold with a percentage of ");
						args.add((int) diskUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
					}
				}
				else
					serverMonitor.checkDiskAlarm(deviceName);
			}
	
			//check for external file systems/devices
			if (Externalflag ||
				(!Externalflag && GlusterConfig == "y" && moduleType == "pm") ){
				try
				{
					DBRootConfigList dbrootConfigList;
					oam.getPmDbrootConfig(moduleID, dbrootConfigList);
	
					DBRootConfigList::iterator pt = dbrootConfigList.begin();
					for( ; pt != dbrootConfigList.end() ; pt++)
					{
						int dbroot = *pt;
						string deviceName = systemConfig.DBRoot[dbroot-1];
						string fileName = deviceName + "/000.dir";
			
						if (DISK_DEBUG) {
							//Log this event 
							LoggingID lid(SERVER_MONITOR_LOG_ID);
							MessageLog ml(lid);
							Message msg;
							Message::Args args;
							args.add("DBRoots monitoring");
							args.add(dbroot);
							args.add(" ,file system =" );
							args.add(fileName);
							msg.format(args);
							ml.logDebugMessage(msg);
						}
	
						uint64_t totalBlocks;
						uint64_t usedBlocks;
			
						if (!statvfs(fileName.c_str(), &buf)) {
			
							uint64_t blksize, blocks, freeblks, free; 
			
							blksize = buf.f_bsize; 
							blocks = buf.f_blocks; 
							freeblks = buf.f_bfree; 
			
							totalBlocks = blocks * blksize;
							free = freeblks * blksize; 
							usedBlocks = totalBlocks - free; 
						}
						else
						{
							SMSystemDisk sd;
							sd.deviceName = deviceName;
							sd.usedPercent = 0;
							sd.totalBlocks = 0;
							sd.usedBlocks = 0;
							sdl.push_back(sd);
							continue;
						}
			
						int diskUsage = 0;
						if ( totalBlocks == 0 ) {
							diskUsage = 0;
				
							//Log this event 
							LoggingID lid(SERVER_MONITOR_LOG_ID);
							MessageLog ml(lid);
							Message msg;
							Message::Args args;
							args.add("Total Disk Usage is set to 0");
							msg.format(args);
							ml.logWarningMessage(msg);
						}
						else
							diskUsage =  (usedBlocks / (totalBlocks / 100)) + 1;
			
						SMSystemDisk sd;
						sd.deviceName = deviceName;
						sd.usedPercent = diskUsage;
						sd.totalBlocks = totalBlocks;
						sd.usedBlocks = usedBlocks;
						sdl.push_back(sd);
		
						if (DISK_DEBUG)
							cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl;
			
						if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 ) {
							//adjust if over 100%
							if ( diskUsage > 100 )
								diskUsage = 100;
							if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage))
							{
								LoggingID lid(SERVER_MONITOR_LOG_ID);
								MessageLog ml(lid);
								Message msg;
								Message::Args args;
								args.add("Disk usage for");
								args.add(deviceName);
								args.add(" above Critical Disk threshold with a percentage of ");
								args.add((int) diskUsage);
								msg.format(args);
								ml.logInfoMessage(msg);
							}
						}
						else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 ) {
							if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage))
							{
								LoggingID lid(SERVER_MONITOR_LOG_ID);
								MessageLog ml(lid);
								Message msg;
								Message::Args args;
								args.add("Disk usage for");
								args.add(deviceName);
								args.add(" above Major Disk threshold with a percentage of ");
								args.add((int) diskUsage);
								msg.format(args);
								ml.logInfoMessage(msg);
							}
						}
						else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 ) {
							if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage))
							{
								LoggingID lid(SERVER_MONITOR_LOG_ID);
								MessageLog ml(lid);
								Message msg;
								Message::Args args;
								args.add("Disk usage for");
								args.add(deviceName);
								args.add(" above Minor Disk threshold with a percentage of ");
								args.add((int) diskUsage);
								msg.format(args);
								ml.logInfoMessage(msg);
							}
						}
						else
							serverMonitor.checkDiskAlarm(deviceName);
					}
				}
				catch (exception& e)
				{
					cout << endl << "**** getPmDbrootConfig Failed :  " << e.what() << endl;
				}
			}
		}

		//check OAM dbroot test flag to validate dbroot exist if on pm
		if ( moduleName.find("pm") != string::npos ) {
			//check OAM dbroot test flag to validate dbroot exist
			if ( dbrootList.size() != 0 ) {
				DBrootList::iterator p = dbrootList.begin();
				while ( p != dbrootList.end() )
				{
					//get dbroot directory
					string dbrootDir = (*p).dbrootDir;
					string dbrootName;
					string dbrootID;

					//get dbroot name
					string::size_type pos = dbrootDir.rfind("/",80);
					if (pos != string::npos)
						dbrootName = dbrootDir.substr(pos+1,80);

					//get ID
					dbrootID = dbrootName.substr(4,80);
			
					string fileName = dbrootDir + "/OAMdbrootCheck";
					// retry in case we hit the remount window
					for ( int retry = 0 ; ; retry++ )
					{
						bool fail = false;
						//first test, check if OAMdbrootCheck exists
						ifstream file (fileName.c_str());
						if (!file)
							fail = true;
						else
						{	//second test for amazon, check volume status
							if ( cloud != oam::UnassignedName ) {
								string volumeNameID = "PMVolumeName" + dbrootID;
								string volumeName = oam::UnassignedName;
								try {
									oam.getSystemConfig( volumeNameID, volumeName);
								}
								catch(...)
								{}
							
								if ( volumeName.empty() || volumeName == oam::UnassignedName )
									fail = false;
								else
								{
									string status = oam.getEC2VolumeStatus(volumeName);
									if ( status == "attached" )
										fail = false;
									else
									{
										fail = true;
										LoggingID lid(SERVER_MONITOR_LOG_ID);
										MessageLog ml(lid);
										Message msg;
										Message::Args args;
										args.add("dbroot monitoring: Volume not attached");
										args.add(volumeName);
										args.add("/");
										args.add(dbrootName);
										msg.format(args);
										ml.logCriticalMessage(msg);
									}
								}
							}
							else
								fail = false;
						}

						if (fail) {
							//double check system status before reporting any error BUG 5078
							SystemStatus systemstatus;
							try {
								oam.getSystemStatus(systemstatus);
							}
							catch (exception& ex)
							{}
							
							if (systemstatus.SystemOpState != oam::ACTIVE ) {
								break;
							}

							if ( retry < 10 ) {
								sleep(3);
								continue;
							}
							else
							{
								if ( !(*p).downFlag ) {
									LoggingID lid(SERVER_MONITOR_LOG_ID);
									MessageLog ml(lid);
									Message msg;
									Message::Args args;
									args.add("dbroot monitoring: Lost access to ");
									args.add(dbrootDir);
									msg.format(args);
									ml.logCriticalMessage(msg);

									oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName);
									(*p).downFlag = true;

									try{
										oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE);
									}
									catch (exception& ex)
									{}

									break;
								}
							}
						}
						else
						{
							if ( (*p).downFlag ) {
								LoggingID lid(SERVER_MONITOR_LOG_ID);
								MessageLog ml(lid);
								Message msg;
								Message::Args args;
								args.add("dbroot monitoring: Access back to ");
								args.add(dbrootDir);
								msg.format(args);
								ml.logInfoMessage(msg);
		
								oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName);
								(*p).downFlag = false;

								try{
									oam.setDbrootStatus(dbrootID, oam::ACTIVE);
								}
								catch (exception& ex)
								{}
							}
							file.close();
							break;
						}
					}
					p++;
				}
			}
		}

		//do Gluster status check, if configured
		if ( GlusterConfig == "y")
		{
			bool pass = true;
			string errmsg = "unknown";
			try {
				string arg1 = "";
				string arg2 = "";
				int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg);
				if ( ret != 0 )
				{
					cerr << "FAILURE: Status check error: " + errmsg << endl;
					pass = false;
				}
			}
			catch (exception& e)
			{
				cerr << endl << "**** glusterctl API exception:  " << e.what() << endl;
				cerr << "FAILURE: Status check error" << endl;
				pass = false;
			}
			catch (...)
			{
				cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl;
				cerr << "FAILURE: Status check error" << endl;
				pass = false;
			}

			if ( !pass )
			{ // issue log and alarm
				LoggingID lid(SERVER_MONITOR_LOG_ID);
				MessageLog ml(lid);
				Message msg;
				Message::Args args;
				args.add("Gluster Status check failure error msg: ");
				args.add(errmsg);
				msg.format(args);
				ml.logWarningMessage(msg);
				serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0);
			}
		}

		// sleep 10 seconds
		sleep(MONITOR_PERIOD/6);

		//check disk space every 10 minutes
		diskSpaceCheck++;
		if ( diskSpaceCheck >= 60 )
			diskSpaceCheck = 0;

		lfs.clear();
		sdl.clear();

	} // end of while loop
}
Beispiel #5
0
void hardwareMonitor(int IPMI_SUPPORT)
{
	ServerMonitor serverMonitor;
	string data[10];
	string SensorName;
	float SensorValue;
    string Units;
	string SensorStatus;
	float lowFatal;
	float lowCritical;
	float lowWarning;
	float highWarning;
	float highCritical;
	float highFatal;
	char *p;

	if( IPMI_SUPPORT == 0) {
		int returnCode = system("ipmitool sensor list > /tmp/harwareMonitor.txt");
		if (returnCode) {
			// System error, Log this event 
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Error running ipmitool sensor list!!!");
			msg.format(args);
			ml.logWarningMessage(msg);
			while(TRUE)
				sleep(10000);
		}
	}
	else
	{
		while(TRUE)
			sleep(10000);
	}

	// register for Heartbeat monitoring
/*	try {
		ProcHeartbeat procheartbeat;
		procheartbeat.registerHeartbeat(HW_HEARTBEAT_ID);
	}
	catch (exception& ex)
	{
		string error = ex.what();
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on registerHeartbeat: ");
		args.add(error);
		msg.format(args);
		ml.logErrorMessage(msg);
	}
	catch(...)
	{
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
		msg.format(args);
		ml.logErrorMessage(msg);
	}
*/
	// loop forever reading the hardware status
	while(TRUE)
	{
		// parse output file
	
		ifstream File ("/tmp/harwareMonitor.txt");
		if (!File){
			// System error, Log this event 
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Error opening /tmp/harwareMonitor.txt!!!");
			msg.format(args);
			ml.logWarningMessage(msg);
			sleep(300);
			continue;
		}
		
		char line[200];
		while (File.getline(line, 200))
		{
			// parse the line
			int f = 0;
			p = strtok(line,"|");
			while (p) 
			{
				data[f]=p;
				data[f] = serverMonitor.StripWhitespace(data[f]);
				p = strtok (NULL, "|");
				f++;
			}
	
			if( f == 0 )
				// nothing on this line, skip
				continue;
	
			SensorName = data[0];
			SensorValue = atof(data[1].c_str());
			Units = data[2];
			SensorStatus = data[3];
			lowFatal = atof(data[4].c_str());
			lowCritical = atof(data[5].c_str());
			lowWarning = atof(data[6].c_str());
			highWarning = atof(data[7].c_str());
			highCritical = atof(data[8].c_str());
			highFatal = atof(data[9].c_str());

			// check status and issue apporiate alarm if needed
			if ( (SensorStatus != "ok") && (SensorStatus != "nr") && (SensorStatus != "na") ) {
				// Status error, check for warning or critical levels

				if ( SensorValue >= highFatal ) {
					// issue critical alarm and send message to shutdown Server
					serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
					serverMonitor.sendMsgShutdownServer();
				}
				else if ( (SensorValue < highFatal) && (SensorValue >= highCritical) )
					// issue major alarm
					serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);

				else if ( (SensorValue < highCritical ) && (SensorValue >= highWarning) )
					// issue minor alarm
					serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);

				else if ( (SensorValue <= lowWarning) && (SensorValue > lowCritical) )
					// issue minor alarm
					serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue);

				else if ( (SensorValue <= lowCritical) && (SensorValue > lowFatal) )
					// issue major alarm
					serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue);

				else if ( SensorValue <= lowFatal ) {
					// issue critical alarm and send message to shutdown Server
					serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue);
					serverMonitor.sendMsgShutdownServer();
				}
				else
					// check if there are any active alarms that needs to be cleared
					serverMonitor.checkAlarm(SensorName);
			}
			else
				// check if there are any active alarms that needs to be cleared
				serverMonitor.checkAlarm(SensorName);

		} //end of parsing file while
		
		File.close();

		// send heartbeat message
/*		try {
			ProcHeartbeat procheartbeat;
			procheartbeat.sendHeartbeat(HW_HEARTBEAT_ID);

			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Sent Heartbeat Msg");
			msg.format(args);
			ml.logDebugMessage(msg);
		}
		catch (exception& ex)
		{
			string error = ex.what();
			if ( error.find("Disabled") == string::npos ) {
				LoggingID lid(SERVER_MONITOR_LOG_ID);
				MessageLog ml(lid);
				Message msg;
				Message::Args args;
				args.add("EXCEPTION ERROR on sendHeartbeat: ");
				args.add(error);
				msg.format(args);
				ml.logErrorMessage(msg);
			}
		}
		catch(...)
		{
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
			msg.format(args);
			ml.logErrorMessage(msg);
		}
*/
		// sleep
		sleep(MONITOR_PERIOD);
	} //end of forever while loop
}
Beispiel #6
0
/*****************************************************************************************
* @brief	cpuMonitor Thread
*
* purpose:	Get current CPU usage, average over 5 readings and report alarms
*
*****************************************************************************************/
void cpuMonitor()
{
	ServerMonitor serverMonitor;

	// register for Heartbeat monitoring
/*	try {
		ProcHeartbeat procheartbeat;
		procheartbeat.registerHeartbeat(CPU_HEARTBEAT_ID);
	}
	catch (exception& ex)
	{
		string error = ex.what();
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on registerHeartbeat: ");
		args.add(error);
		msg.format(args);
		ml.logErrorMessage(msg);
	}
	catch(...)
	{
		LoggingID lid(SERVER_MONITOR_LOG_ID);
		MessageLog ml(lid);
		Message msg;
		Message::Args args;
		args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
		msg.format(args);
		ml.logErrorMessage(msg);
	}
*/
	int periodCount = 5;
	float cpuPeriod[periodCount];
	int periodCounter = 0;
	float averageCpuUsage = 0;
	currentCpuUsage = 0;

	// set defaults
	unsigned int cpuCritical = 0, 
				 cpuMajor = 0, 
				 cpuMinor = 0,
				 cpuMinorClear = 0;

	// initial cpu Period table
	for (int i =0;i < periodCount; i++)
	{
		cpuPeriod[i] = 0;
	}

	while(true)
	{
		// Get CPU usage water mark from server configuration and compare
		ModuleTypeConfig moduleTypeConfig;
		Oam oam;
		try {
			oam.getSystemConfig(moduleTypeConfig);
			cpuCritical = moduleTypeConfig.ModuleCPUCriticalThreshold; 
			cpuMajor = moduleTypeConfig.ModuleCPUMajorThreshold; 
			cpuMinor = moduleTypeConfig.ModuleCPUMinorThreshold;
			cpuMinorClear = moduleTypeConfig.ModuleCPUMinorClearThreshold;
		} catch (...)
		{
			sleep(5);
			continue;
		}

		if (RESOURCE_DEBUG)
			cout << "critical water: " << moduleTypeConfig.ModuleCPUCriticalThreshold << endl;

		pthread_mutex_lock(&CPU_LOCK);
		//
		// get Process and System CPU usage
		//
		serverMonitor.getCPUdata();

		// store and get average
		cpuPeriod[periodCounter] = currentCpuUsage;
		averageCpuUsage = 0;
		for (int i =0;i < periodCount; i++)
		{
			averageCpuUsage += cpuPeriod[i];
		}
		averageCpuUsage = averageCpuUsage / periodCount;

//		serverMonitor.logCPUactive(averageCpuUsage);
		if (CPU_DEBUG) {
			cout << "Current CPU Usage: " << currentCpuUsage << endl;
			cout << "Average CPU Usage: " << averageCpuUsage << endl;
		}

		if (averageCpuUsage >= cpuCritical && cpuCritical > 0 ) {
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_HIGH, SET, (int) averageCpuUsage);
		}
		else if (averageCpuUsage >= cpuMajor && cpuMajor > 0 )
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_MED, SET, (int) averageCpuUsage);
		else if (averageCpuUsage >= cpuMinor && cpuMinor > 0 )
			serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_LOW, SET, (int) averageCpuUsage);
		else if (averageCpuUsage >= cpuMinorClear && cpuMinorClear > 0 ) {
			serverMonitor.checkCPUAlarm("CPU", CPU_USAGE_LOW);
			//Log this event 
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Current CPU usage = ");
			args.add((int) currentCpuUsage);
			args.add(", Average CPU usage = ");
			args.add((int) averageCpuUsage);
			msg.format(args);
			ml.logInfoMessage(msg);
		}
		else
			serverMonitor.checkCPUAlarm("CPU");

		//
		// check CPU usage by process
		//
		ProcessCPUList::iterator p = pcl.begin();
		while(p != pcl.end())
		{
			string processName =  (*p).processName;
			double cpuUsage =  (*p).usedPercent;
			p++;

			if (CPU_DEBUG) {
				cout << "Process Name : " << processName << endl;
				cout << "CPU Usage: " << cpuUsage << endl;
			}

			// check if a Calpont Process, if so alarm is over thresholds
			// if not, just log if over thresholds
			if (cpuUsage >= cpuCritical && cpuCritical > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_HIGH, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Critical CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
			else if (cpuUsage >= cpuMajor && cpuMajor > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_MED, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Major CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
			else if (cpuUsage >= cpuMinor && cpuMinor > 0) {
/*				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.sendResourceAlarm(processName, CPU_USAGE_LOW, SET, (int) cpuUsage);
				}
				catch (...) {
*/						LoggingID lid(SERVER_MONITOR_LOG_ID);
						MessageLog ml(lid);
						Message msg;
						Message::Args args;
						args.add("Process");
						args.add(processName);
						args.add(" above Minor CPU threshold with a percentage of ");
						args.add((int) cpuUsage);
						msg.format(args);
						ml.logInfoMessage(msg);
//				}
			}
/*			else if (cpuUsage >= cpuMinorClear) {
				try {
					t = oam.getMyProcessStatus(processID);
					processName = boost::get<1>(t);

					serverMonitor.checkCPUAlarm(processName, CPU_USAGE_LOW);
				}
				catch (...) {}
			}
			else
				serverMonitor.checkCPUAlarm(processName);
*/		}

		// send heartbeat message
/*		try {
			ProcHeartbeat procheartbeat;
			procheartbeat.sendHeartbeat(CPU_HEARTBEAT_ID);

			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("Sent Heartbeat Msg");
			msg.format(args);
			ml.logInfoMessage(msg);
		}
		catch (exception& ex)
		{
			string error = ex.what();
			if ( error.find("Disabled") == string::npos ) {
				LoggingID lid(SERVER_MONITOR_LOG_ID);
				MessageLog ml(lid);
				Message msg;
				Message::Args args;
				args.add("EXCEPTION ERROR on sendHeartbeat: ");
				args.add(error);
				msg.format(args);
				ml.logErrorMessage(msg);
			}
		}
		catch(...)
		{
			LoggingID lid(SERVER_MONITOR_LOG_ID);
			MessageLog ml(lid);
			Message msg;
			Message::Args args;
			args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!");
			msg.format(args);
			ml.logErrorMessage(msg);
		}
*/

		pthread_mutex_unlock(&CPU_LOCK);

		// sleep, 5 minutes
		sleep(MONITOR_PERIOD*5);

		++periodCounter;
		if ( periodCounter >= periodCount )
			periodCounter = 0;

	} // end of while loop
}
Beispiel #7
0
/*****************************************************************************************
* @brief	logCPUactive
*
* purpose:	Log Peak and Average CPU usage 
*
*****************************************************************************************/
void ServerMonitor::logCPUactive (unsigned int cpuUsage)
{
	ServerMonitor serverMonitor;

	// determin the active log file name
	string usageLogFileName = FE_MOUNT_DIR;
	usageLogFileName = usageLogFileName + "cpu.log";
	
	if (RESOURCE_DEBUG)
		cout << usageLogFileName << endl;
	
	fstream usageLogFile;
	usageLogFile.open (usageLogFileName.c_str(), ios::in|ios::out);

	if (usageLogFile.fail())
	{
		ofstream file (usageLogFileName.c_str());
		file.close();
		usageLogFile.open(usageLogFileName.c_str(), ios::in|ios::out);
		if (!usageLogFile) cout << "--error" << endl;
	}
	
	// get the counter
	usageLogFile.seekg(0, ios::beg);
	usageLogFile.read (reinterpret_cast<char *>(&usageCount), sizeof (int));
	if (usageLogFile.eof()) usageLogFile.clear();

	// new iteration
	if (usageCount == 0)
	{
		usageLogFile.seekp(0, ios::beg);
		usageLogFile.write (reinterpret_cast<char *>(&usageCount), sizeof (int));
	}
	usageCount ++;
	
	// append new usage data to the end
	usageLogFile.seekp (0, ios::end);
	usageLogFile.write (reinterpret_cast<char *>(&cpuUsage), sizeof (int));
	
	if (RESOURCE_DEBUG)
		cout << "usage: " << usageCount << endl;
	
	// calculate peak and average if it's time to log usage data
	if (usageCount >= LOG_FREQ / MONITOR_FREQ)
	{
		usageLogFile.seekg (4, ios::beg);
		usageLogFile.read ((char*)usage, sizeof(unsigned int) * LOG_FREQ/MONITOR_FREQ); 
		if (usageLogFile.eof()) usageLogFile.clear();
		if (RESOURCE_DEBUG)
		{
			for (int i = 0; i < usageCount; i++)
			{
				cout << usage [i] << endl;
			}
		}
		serverMonitor.logCPUstat(usageCount);
		
		// delete the file
		usageLogFile.close();
		unlink (usageLogFileName.c_str());
	}
	
	// else, update usageCount
	else
	{
		usageLogFile.seekp(0, ios::beg);
		usageLogFile.write (reinterpret_cast<char *>(&usageCount), sizeof (int));
		usageLogFile.close();
	}
}
/*****************************************************************************************
* @brief	TcpDataThread
*
* purpose:	Serve one accepted TCP connection
*
* Receives a single command message with overlapped I/O, passes the buffer to
* HandleCommand() and, when HandleCommand() reports a non-zero length, sends that
* many bytes of the same buffer back. The socket is shut down and closed before
* returning in every case.
*
* Message layout as read by this function: COMMANDHEAD, 4 bytes, then a 4-byte
* payload length, followed by the payload itself.
* NOTE(review): the original comment calls the first 4-byte field the
* "windows number" - confirm its meaning against the protocol definition.
*
* @param	lParam	the accepted SOCKET, passed by value as the thread argument
* @return	always 0
*
* NOTE(review): when a wait ends on timeout or on the stop event, an overlapped
* operation may still be pending while overlapped/buff leave scope - confirm
* that closing the socket is sufficient here.
*****************************************************************************************/
DWORD WINAPI ServerMonitor::TcpDataThread(LPVOID lParam)
{
	ServerMonitor *sm = (ServerMonitor *)WinService::GetService();
	SOCKET s_accept=(SOCKET)lParam;
	
	DWORD dwBytes;
	DWORD flags=0;
	
	WSAOVERLAPPED overlapped = {0};
	WSAEVENT hEvent = WSACreateEvent();
	if(hEvent == WSA_INVALID_EVENT)
	{
		// without an event the overlapped waits below cannot work
		int error=WSAGetLastError();
		gLogger.error("[TcpDataThread] WSACreateEvent failed: %u", error);
		shutdown(s_accept,SD_BOTH);
		closesocket(s_accept);
		return 0;
	}
	overlapped.hEvent=hEvent;
	
	WSABUF wsabuf;
	char buff[MAXBUFFLEN];
	wsabuf.buf=buff;
	wsabuf.len=MAXBUFFLEN;
	// index 0: service stop event, index 1: I/O completion event
	WSAEVENT handles[]={sm->_hstop,hEvent};
	int retval=0;
	// bytes expected so far: fixed header first, payload length added once known
	DWORD dwTotalBytes=strlen(COMMANDHEAD)+4+4; //plus windows number
	DWORD recvlen=0;
	bool lengthKnown=false;	// payload length has been read from the header
	bool badLength=false;	// header announced more data than buff can hold
	while(TRUE)
	{
		flags = 0;
		memset(&overlapped, 0, sizeof(overlapped));
		overlapped.hEvent = hEvent;
		if(WSARecv(s_accept,&wsabuf,1,&dwBytes,&flags,&overlapped,NULL)==SOCKET_ERROR)
		{
			int error=WSAGetLastError();
			if(error!=WSA_IO_PENDING)
			{
				gLogger.error("[TcpDataThread] WSARecv failed: %u", error);
				break;
			}
		}

		DWORD dwRet=WSAWaitForMultipleEvents(2, handles, FALSE, RECVTIMEOUT, FALSE);
		// test for failure first: it would otherwise be swallowed by the
		// "not the I/O event" check below and never be logged
		if(dwRet==WSA_WAIT_FAILED)
		{
			int error=WSAGetLastError();
			gLogger.error("[TcpDataThread] WaitForMultipleObjects failed: %u", error);
			break;
		}
		if(WSA_WAIT_EVENT_0 + 1 != dwRet)
		{
			// stop event signalled or timeout: give up on this connection
			if(WSA_WAIT_TIMEOUT == dwRet)
			{
				gLogger.debug("[TcpDataThread] WSARecv timeout");
				printf("[TcpDataThread] WSARecv timeout\n");
			}
			break;
		}
		
		dwBytes=0;
		retval=WSAGetOverlappedResult(s_accept,&overlapped,&dwBytes,FALSE,&flags);
		
		if(retval==FALSE)
		{
			int error=WSAGetLastError();
			if(WSA_IO_INCOMPLETE == error)
			{
				continue;
			}
			else
			{
				gLogger.error("[TcpDataThread] WSAGetOverlappedResult failed: %u", error);
				break;
			}
		}
		
		// zero bytes transferred: stop receiving
		if(dwBytes==0)
		{
			break;
		}

		
		// advance the receive window past the bytes just read
		recvlen+=dwBytes;
		wsabuf.buf=buff+recvlen;
		wsabuf.len=MAXBUFFLEN -recvlen;
		WSAResetEvent(hEvent);
		
		if(!lengthKnown && recvlen >= dwTotalBytes)
		{
			// header complete: fetch the payload length (memcpy avoids a
			// misaligned DWORD access into the char buffer)
			DWORD exlen=0;
			memcpy(&exlen, buff+strlen(COMMANDHEAD)+4, sizeof(exlen));
			lengthKnown=true;
			// the length comes from the peer: reject anything that cannot fit
			// in buff, which also prevents dwTotalBytes from wrapping around.
			// dwTotalBytes <= recvlen <= MAXBUFFLEN here, so no underflow.
			if(exlen > (DWORD)MAXBUFFLEN - dwTotalBytes)
			{
				gLogger.error("[TcpDataThread] invalid payload length: %u", exlen);
				badLength=true;
				break;
			}
			dwTotalBytes+=exlen;
		}

		if(recvlen >= dwTotalBytes)
			break;
	}

	if(!badLength && recvlen >= dwTotalBytes)// receive complete
	{
		printf("recv ok!\n");
		DWORD sendLen=dwTotalBytes;
		gLogger.debug("[TcpDataThread] TotalRecvBytes %u",dwTotalBytes);
		sm->HandleCommand(buff,&sendLen);// after processing, buff is sent back to the peer
		gLogger.debug("[TcpDataThread] TotalSendBytes %u",sendLen);
		if(sendLen != 0)// send the response data
		{
			while(TRUE)
			{
				wsabuf.buf = buff;
				wsabuf.len = sendLen;
				memset(&overlapped, 0, sizeof(overlapped));
				overlapped.hEvent = hEvent;
				if(WSASend(s_accept,&wsabuf,1,&sendLen,0,&overlapped,NULL)==SOCKET_ERROR)
				{
					int error=WSAGetLastError();
					if(error!=WSA_IO_PENDING)
					{
						gLogger.error("[TcpDataThread] WSASend failed: %u", error);
						break;
					}
				}
				
				
				DWORD dwRet=WSAWaitForMultipleEvents(2, handles, FALSE, RECVTIMEOUT, FALSE);
				// failure first, for the same reason as in the receive loop
				if(dwRet==WSA_WAIT_FAILED)
				{
					int error=WSAGetLastError();
					gLogger.error("[TcpDataThread] WaitForMultipleObjects failed: %u", error);
					break;
				}
				if(WSA_WAIT_EVENT_0 + 1 != dwRet)
				{
					if(WSA_WAIT_TIMEOUT == dwRet)
					{
						gLogger.info("[TcpDataThread] WSASend timeout");
						printf("[TcpDataThread] WSASend timeout");
					}
					break;
				}	

				retval=WSAGetOverlappedResult(s_accept,&overlapped,&dwBytes,FALSE,&flags);
				if(retval==FALSE)
				{
					int error=WSAGetLastError();
					if(WSA_IO_INCOMPLETE == error)
					{
						// NOTE(review): raises an exception with code 0 before
						// retrying the send - presumably a deliberate trap for
						// a state that should not occur; confirm intent
						RaiseException(0, 0, 0, NULL);
						continue;
					}
				}
				break;
			}
		}
	}
	else
	{
		printf("recv error\n");
		gLogger.info("[TcpDataThread] recv error");
	}
	
	WSACloseEvent(hEvent);
	gLogger.debug("[TcpDataThread] closesocket");
	shutdown(s_accept,SD_BOTH);
	closesocket(s_accept);
	return 0;
}