/******************************************************************************************
* @brief    sendAlarm
*
* purpose:  Report an out-of-range sensor reading: clear any stale alarm for the item,
*           send an alarm report unless the same alarm is already active for this
*           module, and always write a warning to the log.
*
* @param alarmItem    name of the item (sensor) the alarm refers to
* @param alarmID      alarm to raise
* @param action       action value forwarded unchanged to the alarm report
* @param sensorValue  reading that triggered the call; only used in the log text
*
******************************************************************************************/
void ServerMonitor::sendAlarm(string alarmItem, ALARMS alarmID, int action, float sensorValue)
{
    ServerMonitor serverMonitor;
    Oam oam;

    // Start building the log entry for this event
    LoggingID lid(SERVER_MONITOR_LOG_ID);
    MessageLog ml(lid);
    Message msg;
    Message::Args args;
    args.add(alarmItem);
    args.add(", sensor value out-of-range: ");
    args.add(sensorValue);

    // Resolve the name of the module we are running on; fall back to a
    // placeholder when the lookup throws
    string moduleName;
    oamModuleInfo_t moduleInfo;

    try
    {
        moduleInfo = oam.getModuleInfo();
        moduleName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        moduleName = "Unknown Server";
    }

    // Give checkAlarm the chance to clear an active alarm above the
    // reporting threshold before a new one is raised
    serverMonitor.checkAlarm(alarmItem, alarmID);

    // Do not resend an alarm that is already active for this module/item
    const bool alreadyActive = oam.checkActiveAlarm(alarmID, moduleName, alarmItem);

    if (!alreadyActive)
    {
        SNMPManager alarmMgr;
        alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);

        args.add(", Alarm set: ");
        args.add(alarmID);
    }

    // The warning is logged whether or not an alarm report was sent
    msg.format(args);
    ml.logWarningMessage(msg);
}
int main(int argc, char* argv[]) { ::CoInitializeEx(NULL, COINIT_MULTITHREADED); g_appPath = Linkwork::Win32::GetAppPath(); g_strconfigfile= g_appPath+ "ServerMonitor.ini"; g_strserverini = g_appPath + "Server.ini"; // g_strAlertServerini = g_appPath.substr(0,g_appPath.rfind('\\')) + ""; SetCurrentDirectory(g_appPath.c_str()); gLogger.init(g_strconfigfile.c_str(), "ServerMonitor"); #ifndef _DEBUG SetAutoMinidump(); #endif ServerMonitor sm; if ( (argc > 1) && ((*argv[1] == '-') || (*argv[1] == '/')) ) { if ( _stricmp( "debug", argv[1]+1 ) == 0 ) { //调试运行 sm.debugservice(); return TRUE; } } if(!sm.startservice("ESServerMonitor")) { gLogger.debug("服务启动失败"); } return 0; }
/****************************************************************************************** * @brief sendResourceAlarm * * purpose: send a trap and log the process information * ******************************************************************************************/ bool ServerMonitor::sendResourceAlarm(string alarmItem, ALARMS alarmID, int action, int usage) { ServerMonitor serverMonitor; Oam oam; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add(alarmItem); args.add(" usage at percentage of "); args.add(usage); // get current module name string moduleName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); moduleName = boost::get<0>(st); } catch (...) { moduleName = "Unknown Server"; } // check if there is an active alarm above the reporting theshold // that needs to be cleared if (alarmItem == "CPU") serverMonitor.checkCPUAlarm(alarmItem, alarmID); else if (alarmItem == "Local Disk" || alarmItem == "External") serverMonitor.checkDiskAlarm(alarmItem, alarmID); else if (alarmItem == "Local Memory") serverMonitor.checkMemoryAlarm(alarmItem, alarmID); else if (alarmItem == "Local Swap") serverMonitor.checkSwapAlarm(alarmItem, alarmID); // don't issue an alarm on thge dbroots is already issued by this or another server if ( alarmItem.find(startup::StartUp::installDir() + "/data") == 0 ) { // check if Alarm is already active from any module, don't resend if ( !( oam.checkActiveAlarm(alarmID, "*", alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } else { // check if Alarm is already active from this module, don't resend if ( !( oam.checkActiveAlarm(alarmID, moduleName, alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); 
args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } return true; }
/***************************************************************************************** * @brief diskMonitor Thread * * purpose: Get current Local and External disk usage and report alarms * *****************************************************************************************/ void diskMonitor() { ServerMonitor serverMonitor; Oam oam; SystemConfig systemConfig; ModuleTypeConfig moduleTypeConfig; typedef std::vector<std::string> LocalFileSystems; LocalFileSystems lfs; struct statvfs buf; // set defaults int localDiskCritical = 90, localDiskMajor = 80, localDiskMinor = 70, ExternalDiskCritical = 90, ExternalDiskMajor = 80, ExternalDiskMinor = 70; // get module types string moduleType; int moduleID=-1; string moduleName; oamModuleInfo_t t; try { t = oam.getModuleInfo(); moduleType = boost::get<1>(t); moduleID = boost::get<2>(t); moduleName = boost::get<0>(t); } catch (exception& e) {} bool Externalflag = false; //check for external disk DBrootList dbrootList; if (moduleType == "pm") { systemStorageInfo_t t; t = oam.getStorageConfig(); if ( boost::get<0>(t) == "external") Externalflag = true; // get dbroot list and storage type from config file DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbrootID = *pt; string dbroot = "DBRoot" + oam.itoa(dbrootID); string dbootdir; try{ oam.getSystemConfig(dbroot, dbootdir); } catch(...) {} if ( dbootdir.empty() || dbootdir == "" ) continue; DBrootData dbrootData; dbrootData.dbrootDir = dbootdir; dbrootData.downFlag = false; dbrootList.push_back(dbrootData); } } string cloud = oam::UnassignedName; try { oam.getSystemConfig( "Cloud", cloud); } catch(...) { cloud = oam::UnassignedName; } //get Gluster Config setting string GlusterConfig = "n"; try { oam.getSystemConfig( "GlusterConfig", GlusterConfig); } catch(...) 
{ GlusterConfig = "n"; } int diskSpaceCheck = 0; while(true) { SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { sleep(5); continue; } // Get Local/External Disk Mount points to monitor and associated thresholds try { oam.getSystemConfig (moduleTypeConfig); localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold; localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold; localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold; DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin(); for( ; p != moduleTypeConfig.FileSystems.end() ; p++) { string fs = *p; lfs.push_back(fs); if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Config File System to monitor ="); args.add(fs); msg.format(args); ml.logDebugMessage(msg); } } } catch (...) { sleep(5); continue; } // get External info try { oam.getSystemConfig(systemConfig); } catch (...) { sleep(5); continue; } if (Externalflag) { // get External info try { ExternalDiskCritical = systemConfig.ExternalCriticalThreshold; ExternalDiskMajor = systemConfig.ExternalMajorThreshold; ExternalDiskMinor = systemConfig.ExternalMinorThreshold; } catch (...) 
{ sleep(5); continue; } } //check for local file systems LocalFileSystems::iterator p = lfs.begin(); while(p != lfs.end()) { string deviceName = *p; ++p; string fileName; // check local if ( deviceName == "/") { fileName = deviceName + "usr/local/Calpont/releasenum"; } else { fileName = deviceName + "/000.dir"; } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else continue; int64_t diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if ( diskSpaceCheck == 0 ) { if (diskUsage >= localDiskCritical && localDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMajor && localDiskMajor > 0 ) { if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Major Disk threshold with a percentage of "); args.add((int) 
diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMinor && localDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } //check for external file systems/devices if (Externalflag || (!Externalflag && GlusterConfig == "y" && moduleType == "pm") ){ try { DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbroot = *pt; string deviceName = systemConfig.DBRoot[dbroot-1]; string fileName = deviceName + "/000.dir"; if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("DBRoots monitoring"); args.add(dbroot); args.add(" ,file system =" ); args.add(fileName); msg.format(args); ml.logDebugMessage(msg); } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else { SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = 0; sd.totalBlocks = 0; sd.usedBlocks = 0; sdl.push_back(sd); continue; } int diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; 
sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Major Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } } catch (exception& e) { cout << endl << "**** getPmDbrootConfig Failed : " << e.what() << endl; } } } //check OAM dbroot test flag to validate dbroot exist if on pm if ( moduleName.find("pm") != string::npos ) { //check OAM dbroot test flag to validate dbroot exist if ( dbrootList.size() != 0 ) { DBrootList::iterator p = dbrootList.begin(); while ( p != dbrootList.end() ) { //get dbroot directory 
string dbrootDir = (*p).dbrootDir; string dbrootName; string dbrootID; //get dbroot name string::size_type pos = dbrootDir.rfind("/",80); if (pos != string::npos) dbrootName = dbrootDir.substr(pos+1,80); //get ID dbrootID = dbrootName.substr(4,80); string fileName = dbrootDir + "/OAMdbrootCheck"; // retry in case we hit the remount window for ( int retry = 0 ; ; retry++ ) { bool fail = false; //first test, check if OAMdbrootCheck exists ifstream file (fileName.c_str()); if (!file) fail = true; else { //second test for amazon, check volume status if ( cloud != oam::UnassignedName ) { string volumeNameID = "PMVolumeName" + dbrootID; string volumeName = oam::UnassignedName; try { oam.getSystemConfig( volumeNameID, volumeName); } catch(...) {} if ( volumeName.empty() || volumeName == oam::UnassignedName ) fail = false; else { string status = oam.getEC2VolumeStatus(volumeName); if ( status == "attached" ) fail = false; else { fail = true; LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Volume not attached"); args.add(volumeName); args.add("/"); args.add(dbrootName); msg.format(args); ml.logCriticalMessage(msg); } } } else fail = false; } if (fail) { //double check system status before reporting any error BUG 5078 SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { break; } if ( retry < 10 ) { sleep(3); continue; } else { if ( !(*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Lost access to "); args.add(dbrootDir); msg.format(args); ml.logCriticalMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName); (*p).downFlag = true; try{ oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE); } catch (exception& ex) {} break; } } } else { if ( (*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog 
ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Access back to "); args.add(dbrootDir); msg.format(args); ml.logInfoMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName); (*p).downFlag = false; try{ oam.setDbrootStatus(dbrootID, oam::ACTIVE); } catch (exception& ex) {} } file.close(); break; } } p++; } } } //do Gluster status check, if configured if ( GlusterConfig == "y") { bool pass = true; string errmsg = "unknown"; try { string arg1 = ""; string arg2 = ""; int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg); if ( ret != 0 ) { cerr << "FAILURE: Status check error: " + errmsg << endl; pass = false; } } catch (exception& e) { cerr << endl << "**** glusterctl API exception: " << e.what() << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } catch (...) { cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } if ( !pass ) { // issue log and alarm LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Gluster Status check failure error msg: "); args.add(errmsg); msg.format(args); ml.logWarningMessage(msg); serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0); } } // sleep 10 seconds sleep(MONITOR_PERIOD/6); //check disk space every 10 minutes diskSpaceCheck++; if ( diskSpaceCheck >= 60 ) diskSpaceCheck = 0; lfs.clear(); sdl.clear(); } // end of while loop }
void hardwareMonitor(int IPMI_SUPPORT) { ServerMonitor serverMonitor; string data[10]; string SensorName; float SensorValue; string Units; string SensorStatus; float lowFatal; float lowCritical; float lowWarning; float highWarning; float highCritical; float highFatal; char *p; if( IPMI_SUPPORT == 0) { int returnCode = system("ipmitool sensor list > /tmp/harwareMonitor.txt"); if (returnCode) { // System error, Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Error running ipmitool sensor list!!!"); msg.format(args); ml.logWarningMessage(msg); while(TRUE) sleep(10000); } } else { while(TRUE) sleep(10000); } // register for Heartbeat monitoring /* try { ProcHeartbeat procheartbeat; procheartbeat.registerHeartbeat(HW_HEARTBEAT_ID); } catch (exception& ex) { string error = ex.what(); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on registerHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } catch(...) 
{ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ // loop forever reading the hardware status while(TRUE) { // parse output file ifstream File ("/tmp/harwareMonitor.txt"); if (!File){ // System error, Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Error opening /tmp/harwareMonitor.txt!!!"); msg.format(args); ml.logWarningMessage(msg); sleep(300); continue; } char line[200]; while (File.getline(line, 200)) { // parse the line int f = 0; p = strtok(line,"|"); while (p) { data[f]=p; data[f] = serverMonitor.StripWhitespace(data[f]); p = strtok (NULL, "|"); f++; } if( f == 0 ) // nothing on this line, skip continue; SensorName = data[0]; SensorValue = atof(data[1].c_str()); Units = data[2]; SensorStatus = data[3]; lowFatal = atof(data[4].c_str()); lowCritical = atof(data[5].c_str()); lowWarning = atof(data[6].c_str()); highWarning = atof(data[7].c_str()); highCritical = atof(data[8].c_str()); highFatal = atof(data[9].c_str()); // check status and issue apporiate alarm if needed if ( (SensorStatus != "ok") && (SensorStatus != "nr") && (SensorStatus != "na") ) { // Status error, check for warning or critical levels if ( SensorValue >= highFatal ) { // issue critical alarm and send message to shutdown Server serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue); serverMonitor.sendMsgShutdownServer(); } else if ( (SensorValue < highFatal) && (SensorValue >= highCritical) ) // issue major alarm serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue); else if ( (SensorValue < highCritical ) && (SensorValue >= highWarning) ) // issue minor alarm serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue); else if ( (SensorValue <= lowWarning) && (SensorValue > lowCritical) ) // issue minor alarm 
serverMonitor.sendAlarm(SensorName, HARDWARE_LOW, SET, SensorValue); else if ( (SensorValue <= lowCritical) && (SensorValue > lowFatal) ) // issue major alarm serverMonitor.sendAlarm(SensorName, HARDWARE_MED, SET, SensorValue); else if ( SensorValue <= lowFatal ) { // issue critical alarm and send message to shutdown Server serverMonitor.sendAlarm(SensorName, HARDWARE_HIGH, SET, SensorValue); serverMonitor.sendMsgShutdownServer(); } else // check if there are any active alarms that needs to be cleared serverMonitor.checkAlarm(SensorName); } else // check if there are any active alarms that needs to be cleared serverMonitor.checkAlarm(SensorName); } //end of parsing file while File.close(); // send heartbeat message /* try { ProcHeartbeat procheartbeat; procheartbeat.sendHeartbeat(HW_HEARTBEAT_ID); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Sent Heartbeat Msg"); msg.format(args); ml.logDebugMessage(msg); } catch (exception& ex) { string error = ex.what(); if ( error.find("Disabled") == string::npos ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } } catch(...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ // sleep sleep(MONITOR_PERIOD); } //end of forever while loop }
/***************************************************************************************** * @brief cpuMonitor Thread * * purpose: Get current CPU usage, average over 5 readings and report alarms * *****************************************************************************************/ void cpuMonitor() { ServerMonitor serverMonitor; // register for Heartbeat monitoring /* try { ProcHeartbeat procheartbeat; procheartbeat.registerHeartbeat(CPU_HEARTBEAT_ID); } catch (exception& ex) { string error = ex.what(); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on registerHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } catch(...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ int periodCount = 5; float cpuPeriod[periodCount]; int periodCounter = 0; float averageCpuUsage = 0; currentCpuUsage = 0; // set defaults unsigned int cpuCritical = 0, cpuMajor = 0, cpuMinor = 0, cpuMinorClear = 0; // initial cpu Period table for (int i =0;i < periodCount; i++) { cpuPeriod[i] = 0; } while(true) { // Get CPU usage water mark from server configuration and compare ModuleTypeConfig moduleTypeConfig; Oam oam; try { oam.getSystemConfig(moduleTypeConfig); cpuCritical = moduleTypeConfig.ModuleCPUCriticalThreshold; cpuMajor = moduleTypeConfig.ModuleCPUMajorThreshold; cpuMinor = moduleTypeConfig.ModuleCPUMinorThreshold; cpuMinorClear = moduleTypeConfig.ModuleCPUMinorClearThreshold; } catch (...) 
{ sleep(5); continue; } if (RESOURCE_DEBUG) cout << "critical water: " << moduleTypeConfig.ModuleCPUCriticalThreshold << endl; pthread_mutex_lock(&CPU_LOCK); // // get Process and System CPU usage // serverMonitor.getCPUdata(); // store and get average cpuPeriod[periodCounter] = currentCpuUsage; averageCpuUsage = 0; for (int i =0;i < periodCount; i++) { averageCpuUsage += cpuPeriod[i]; } averageCpuUsage = averageCpuUsage / periodCount; // serverMonitor.logCPUactive(averageCpuUsage); if (CPU_DEBUG) { cout << "Current CPU Usage: " << currentCpuUsage << endl; cout << "Average CPU Usage: " << averageCpuUsage << endl; } if (averageCpuUsage >= cpuCritical && cpuCritical > 0 ) { serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_HIGH, SET, (int) averageCpuUsage); } else if (averageCpuUsage >= cpuMajor && cpuMajor > 0 ) serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_MED, SET, (int) averageCpuUsage); else if (averageCpuUsage >= cpuMinor && cpuMinor > 0 ) serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_LOW, SET, (int) averageCpuUsage); else if (averageCpuUsage >= cpuMinorClear && cpuMinorClear > 0 ) { serverMonitor.checkCPUAlarm("CPU", CPU_USAGE_LOW); //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Current CPU usage = "); args.add((int) currentCpuUsage); args.add(", Average CPU usage = "); args.add((int) averageCpuUsage); msg.format(args); ml.logInfoMessage(msg); } else serverMonitor.checkCPUAlarm("CPU"); // // check CPU usage by process // ProcessCPUList::iterator p = pcl.begin(); while(p != pcl.end()) { string processName = (*p).processName; double cpuUsage = (*p).usedPercent; p++; if (CPU_DEBUG) { cout << "Process Name : " << processName << endl; cout << "CPU Usage: " << cpuUsage << endl; } // check if a Calpont Process, if so alarm is over thresholds // if not, just log if over thresholds if (cpuUsage >= cpuCritical && cpuCritical > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = 
boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_HIGH, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Critical CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } else if (cpuUsage >= cpuMajor && cpuMajor > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_MED, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Major CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } else if (cpuUsage >= cpuMinor && cpuMinor > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_LOW, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Minor CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } /* else if (cpuUsage >= cpuMinorClear) { try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.checkCPUAlarm(processName, CPU_USAGE_LOW); } catch (...) 
{} } else serverMonitor.checkCPUAlarm(processName); */ } // send heartbeat message /* try { ProcHeartbeat procheartbeat; procheartbeat.sendHeartbeat(CPU_HEARTBEAT_ID); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Sent Heartbeat Msg"); msg.format(args); ml.logInfoMessage(msg); } catch (exception& ex) { string error = ex.what(); if ( error.find("Disabled") == string::npos ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } } catch(...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ pthread_mutex_unlock(&CPU_LOCK); // sleep, 5 minutes sleep(MONITOR_PERIOD*5); ++periodCounter; if ( periodCounter >= periodCount ) periodCounter = 0; } // end of while loop }
/***************************************************************************************** * @brief logCPUactive * * purpose: Log Peak and Average CPU usage * *****************************************************************************************/ void ServerMonitor::logCPUactive (unsigned int cpuUsage) { ServerMonitor serverMonitor; // determin the active log file name string usageLogFileName = FE_MOUNT_DIR; usageLogFileName = usageLogFileName + "cpu.log"; if (RESOURCE_DEBUG) cout << usageLogFileName << endl; fstream usageLogFile; usageLogFile.open (usageLogFileName.c_str(), ios::in|ios::out); if (usageLogFile.fail()) { ofstream file (usageLogFileName.c_str()); file.close(); usageLogFile.open(usageLogFileName.c_str(), ios::in|ios::out); if (!usageLogFile) cout << "--error" << endl; } // get the counter usageLogFile.seekg(0, ios::beg); usageLogFile.read (reinterpret_cast<char *>(&usageCount), sizeof (int)); if (usageLogFile.eof()) usageLogFile.clear(); // new iteration if (usageCount == 0) { usageLogFile.seekp(0, ios::beg); usageLogFile.write (reinterpret_cast<char *>(&usageCount), sizeof (int)); } usageCount ++; // append new usage data to the end usageLogFile.seekp (0, ios::end); usageLogFile.write (reinterpret_cast<char *>(&cpuUsage), sizeof (int)); if (RESOURCE_DEBUG) cout << "usage: " << usageCount << endl; // calculate peak and average if it's time to log usage data if (usageCount >= LOG_FREQ / MONITOR_FREQ) { usageLogFile.seekg (4, ios::beg); usageLogFile.read ((char*)usage, sizeof(unsigned int) * LOG_FREQ/MONITOR_FREQ); if (usageLogFile.eof()) usageLogFile.clear(); if (RESOURCE_DEBUG) { for (int i = 0; i < usageCount; i++) { cout << usage [i] << endl; } } serverMonitor.logCPUstat(usageCount); // delete the file usageLogFile.close(); unlink (usageLogFileName.c_str()); } // else, update usageCount else { usageLogFile.seekp(0, ios::beg); usageLogFile.write (reinterpret_cast<char *>(&usageCount), sizeof (int)); usageLogFile.close(); } }
/**
 * Thread routine serving one accepted TCP connection.
 *
 * Receives one request with overlapped I/O, passes it to HandleCommand() and
 * sends back whatever HandleCommand() left in the buffer. The socket is shut
 * down and closed before returning.
 *
 * Request layout as read by this function: a header of
 * strlen(COMMANDHEAD) + 4 + 4 bytes whose last 4 bytes hold the length of the
 * payload that follows. The whole request must fit into MAXBUFFLEN bytes.
 *
 * Every wait also watches the service stop event (sm->_hstop); a stop request,
 * a timeout (RECVTIMEOUT) or an error ends the thread.
 *
 * @param lParam  the accepted SOCKET, passed as the thread argument
 * @return always 0
 */
DWORD WINAPI ServerMonitor::TcpDataThread(LPVOID lParam)
{
    ServerMonitor *sm = (ServerMonitor *)WinService::GetService();
    SOCKET s_accept = (SOCKET)lParam;

    WSAEVENT hEvent = WSACreateEvent();

    if (hEvent == WSA_INVALID_EVENT)
    {
        gLogger.error("[TcpDataThread] WSACreateEvent failed: %u", WSAGetLastError());
        shutdown(s_accept, SD_BOTH);
        closesocket(s_accept);
        return 0;
    }

    DWORD dwBytes = 0;
    DWORD flags = 0;
    WSAOVERLAPPED overlapped = {0};
    overlapped.hEvent = hEvent;

    char buff[MAXBUFFLEN];
    WSABUF wsabuf;
    wsabuf.buf = buff;
    wsabuf.len = MAXBUFFLEN;

    // index 0: service stop event, index 1: I/O completion event
    WSAEVENT handles[] = {sm->_hstop, hEvent};
    int retval = 0;

    DWORD dwTotalBytes = strlen(COMMANDHEAD) + 4 + 4; // header, grows by the payload length
    DWORD recvlen = 0;
    bool payloadLenKnown = false;   // payload length already added to dwTotalBytes
    bool requestComplete = false;   // a whole, valid request is in buff

    while (TRUE)
    {
        flags = 0;
        memset(&overlapped, 0, sizeof(overlapped));
        overlapped.hEvent = hEvent;

        if (WSARecv(s_accept, &wsabuf, 1, &dwBytes, &flags, &overlapped, NULL) == SOCKET_ERROR)
        {
            int error = WSAGetLastError();

            if (error != WSA_IO_PENDING)
            {
                gLogger.error("[TcpDataThread] WSARecv failed: %u", error);
                break;
            }
        }

        DWORD dwRet = WSAWaitForMultipleEvents(2, handles, FALSE, RECVTIMEOUT, FALSE);

        // Test for failure first: it is also "not the I/O event" and would
        // otherwise be swallowed by the next check without being logged.
        if (dwRet == WSA_WAIT_FAILED)
        {
            int error = WSAGetLastError();
            gLogger.error("[TcpDataThread] WaitForMultipleObjects failed: %u", error);
            break;
        }

        if (WSA_WAIT_EVENT_0 + 1 != dwRet)
        {
            // stop requested or timed out
            if (WSA_WAIT_TIMEOUT == dwRet)
            {
                gLogger.debug("[TcpDataThread] WSARecv timeout");
                printf("[TcpDataThread] WSARecv timeout\n");
            }

            break;
        }

        dwBytes = 0;
        retval = WSAGetOverlappedResult(s_accept, &overlapped, &dwBytes, FALSE, &flags);

        if (retval == FALSE)
        {
            int error = WSAGetLastError();

            if (WSA_IO_INCOMPLETE == error)
            {
                continue;
            }
            else
            {
                gLogger.error("[TcpDataThread] WSAGetOverlappedResult failed: %u", error);
                break;
            }
        }

        // zero bytes: connection closed by the peer (or no buffer space left)
        if (dwBytes == 0)
        {
            break;
        }

        recvlen += dwBytes;
        wsabuf.buf = buff + recvlen;
        wsabuf.len = MAXBUFFLEN - recvlen;
        WSAResetEvent(hEvent);

        if (!payloadLenKnown && recvlen >= dwTotalBytes)
        {
            // header complete: read the payload length from its last 4 bytes
            DWORD exlen = 0;
            memcpy(&exlen, buff + strlen(COMMANDHEAD) + 4, sizeof(exlen));

            // The length comes from the peer. Reject anything that cannot fit
            // into the buffer (this also rules out wrap-around of the sum).
            if (dwTotalBytes > (DWORD)MAXBUFFLEN || exlen > (DWORD)MAXBUFFLEN - dwTotalBytes)
            {
                gLogger.error("[TcpDataThread] invalid payload length: %u", exlen);
                break;
            }

            dwTotalBytes += exlen;
            payloadLenKnown = true;
        }

        if (recvlen >= dwTotalBytes)
        {
            requestComplete = true;
            break;
        }
    }

    if (requestComplete) // receive complete
    {
        printf("recv ok!\n");
        DWORD sendLen = dwTotalBytes;
        gLogger.debug("[TcpDataThread] TotalRecvBytes %u", dwTotalBytes);

        // HandleCommand processes the request in place; afterwards buff holds
        // the reply and sendLen its length.
        sm->HandleCommand(buff, &sendLen);
        gLogger.debug("[TcpDataThread] TotalSendBytes %u", sendLen);

        if (sendLen != 0) // send the reply
        {
            while (TRUE)
            {
                wsabuf.buf = buff;
                wsabuf.len = sendLen;
                memset(&overlapped, 0, sizeof(overlapped));
                overlapped.hEvent = hEvent;

                // The byte count goes into dwBytes so that sendLen keeps the
                // reply length should this loop run again.
                if (WSASend(s_accept, &wsabuf, 1, &dwBytes, 0, &overlapped, NULL) == SOCKET_ERROR)
                {
                    int error = WSAGetLastError();

                    if (error != WSA_IO_PENDING)
                    {
                        gLogger.error("[TcpDataThread] WSASend failed: %u", error);
                        break;
                    }
                }

                DWORD dwRet = WSAWaitForMultipleEvents(2, handles, FALSE, RECVTIMEOUT, FALSE);

                if (dwRet == WSA_WAIT_FAILED)
                {
                    int error = WSAGetLastError();
                    gLogger.error("[TcpDataThread] WaitForMultipleObjects failed: %u", error);
                    break;
                }

                if (WSA_WAIT_EVENT_0 + 1 != dwRet)
                {
                    // stop requested or timed out
                    if (WSA_WAIT_TIMEOUT == dwRet)
                    {
                        gLogger.info("[TcpDataThread] WSASend timeout");
                        printf("[TcpDataThread] WSASend timeout");
                    }

                    break;
                }

                retval = WSAGetOverlappedResult(s_accept, &overlapped, &dwBytes, FALSE, &flags);

                if (retval == FALSE)
                {
                    int error = WSAGetLastError();

                    if (WSA_IO_INCOMPLETE == error)
                    {
                        // NOTE(review): raises an exception with code 0 and then
                        // retries the send; kept as is -- confirm the intent.
                        RaiseException(0, 0, 0, NULL);
                        continue;
                    }
                }

                break;
            }
        }
    }
    else
    {
        printf("recv error\n");
        gLogger.info("[TcpDataThread] recv error");
    }

    WSACloseEvent(hEvent);
    gLogger.debug("[TcpDataThread] closesocket");
    shutdown(s_accept, SD_BOTH);
    closesocket(s_accept);
    return 0;
}