/****************************************************************************************** * @brief sendResourceAlarm * * purpose: send a trap and log the process information * ******************************************************************************************/ bool ServerMonitor::sendResourceAlarm(string alarmItem, ALARMS alarmID, int action, int usage) { ServerMonitor serverMonitor; Oam oam; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add(alarmItem); args.add(" usage at percentage of "); args.add(usage); // get current module name string moduleName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); moduleName = boost::get<0>(st); } catch (...) { moduleName = "Unknown Server"; } // check if there is an active alarm above the reporting theshold // that needs to be cleared if (alarmItem == "CPU") serverMonitor.checkCPUAlarm(alarmItem, alarmID); else if (alarmItem == "Local Disk" || alarmItem == "External") serverMonitor.checkDiskAlarm(alarmItem, alarmID); else if (alarmItem == "Local Memory") serverMonitor.checkMemoryAlarm(alarmItem, alarmID); else if (alarmItem == "Local Swap") serverMonitor.checkSwapAlarm(alarmItem, alarmID); // don't issue an alarm on thge dbroots is already issued by this or another server if ( alarmItem.find(startup::StartUp::installDir() + "/data") == 0 ) { // check if Alarm is already active from any module, don't resend if ( !( oam.checkActiveAlarm(alarmID, "*", alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } else { // check if Alarm is already active from this module, don't resend if ( !( oam.checkActiveAlarm(alarmID, moduleName, alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } return true; }
/***************************************************************************************** * @brief cpuMonitor Thread * * purpose: Get current CPU usage, average over 5 readings and report alarms * *****************************************************************************************/ void cpuMonitor() { ServerMonitor serverMonitor; // register for Heartbeat monitoring /* try { ProcHeartbeat procheartbeat; procheartbeat.registerHeartbeat(CPU_HEARTBEAT_ID); } catch (exception& ex) { string error = ex.what(); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on registerHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } catch(...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ int periodCount = 5; float cpuPeriod[periodCount]; int periodCounter = 0; float averageCpuUsage = 0; currentCpuUsage = 0; // set defaults unsigned int cpuCritical = 0, cpuMajor = 0, cpuMinor = 0, cpuMinorClear = 0; // initial cpu Period table for (int i =0;i < periodCount; i++) { cpuPeriod[i] = 0; } while(true) { // Get CPU usage water mark from server configuration and compare ModuleTypeConfig moduleTypeConfig; Oam oam; try { oam.getSystemConfig(moduleTypeConfig); cpuCritical = moduleTypeConfig.ModuleCPUCriticalThreshold; cpuMajor = moduleTypeConfig.ModuleCPUMajorThreshold; cpuMinor = moduleTypeConfig.ModuleCPUMinorThreshold; cpuMinorClear = moduleTypeConfig.ModuleCPUMinorClearThreshold; } catch (...) { sleep(5); continue; } if (RESOURCE_DEBUG) cout << "critical water: " << moduleTypeConfig.ModuleCPUCriticalThreshold << endl; pthread_mutex_lock(&CPU_LOCK); // // get Process and System CPU usage // serverMonitor.getCPUdata(); // store and get average cpuPeriod[periodCounter] = currentCpuUsage; averageCpuUsage = 0; for (int i =0;i < periodCount; i++) { averageCpuUsage += cpuPeriod[i]; } averageCpuUsage = averageCpuUsage / periodCount; // serverMonitor.logCPUactive(averageCpuUsage); if (CPU_DEBUG) { cout << "Current CPU Usage: " << currentCpuUsage << endl; cout << "Average CPU Usage: " << averageCpuUsage << endl; } if (averageCpuUsage >= cpuCritical && cpuCritical > 0 ) { serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_HIGH, SET, (int) averageCpuUsage); } else if (averageCpuUsage >= cpuMajor && cpuMajor > 0 ) serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_MED, SET, (int) averageCpuUsage); else if (averageCpuUsage >= cpuMinor && cpuMinor > 0 ) serverMonitor.sendResourceAlarm("CPU", CPU_USAGE_LOW, SET, (int) averageCpuUsage); else if (averageCpuUsage >= cpuMinorClear && cpuMinorClear > 0 ) { serverMonitor.checkCPUAlarm("CPU", CPU_USAGE_LOW); //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Current CPU usage = "); args.add((int) currentCpuUsage); args.add(", Average CPU usage = "); args.add((int) averageCpuUsage); msg.format(args); ml.logInfoMessage(msg); } else serverMonitor.checkCPUAlarm("CPU"); // // check CPU usage by process // ProcessCPUList::iterator p = pcl.begin(); while(p != pcl.end()) { string processName = (*p).processName; double cpuUsage = (*p).usedPercent; p++; if (CPU_DEBUG) { cout << "Process Name : " << processName << endl; cout << "CPU Usage: " << cpuUsage << endl; } // check if a Calpont Process, if so alarm is over thresholds // if not, just log if over thresholds if (cpuUsage >= cpuCritical && cpuCritical > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_HIGH, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Critical CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } else if (cpuUsage >= cpuMajor && cpuMajor > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_MED, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Major CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } else if (cpuUsage >= cpuMinor && cpuMinor > 0) { /* try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.sendResourceAlarm(processName, CPU_USAGE_LOW, SET, (int) cpuUsage); } catch (...) { */ LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Process"); args.add(processName); args.add(" above Minor CPU threshold with a percentage of "); args.add((int) cpuUsage); msg.format(args); ml.logInfoMessage(msg); // } } /* else if (cpuUsage >= cpuMinorClear) { try { t = oam.getMyProcessStatus(processID); processName = boost::get<1>(t); serverMonitor.checkCPUAlarm(processName, CPU_USAGE_LOW); } catch (...) {} } else serverMonitor.checkCPUAlarm(processName); */ } // send heartbeat message /* try { ProcHeartbeat procheartbeat; procheartbeat.sendHeartbeat(CPU_HEARTBEAT_ID); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Sent Heartbeat Msg"); msg.format(args); ml.logInfoMessage(msg); } catch (exception& ex) { string error = ex.what(); if ( error.find("Disabled") == string::npos ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: "); args.add(error); msg.format(args); ml.logErrorMessage(msg); } } catch(...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("EXCEPTION ERROR on sendHeartbeat: Caught unknown exception!"); msg.format(args); ml.logErrorMessage(msg); } */ pthread_mutex_unlock(&CPU_LOCK); // sleep, 5 minutes sleep(MONITOR_PERIOD*5); ++periodCounter; if ( periodCounter >= periodCount ) periodCounter = 0; } // end of while loop }