/******************************************************************************************
* @brief	sendAlarm
*
* purpose:	Raise an alarm for a sensor reading and write the event to the log.
*
* @param alarmItem    name of the item the alarm is raised against
* @param alarmID      alarm to raise
* @param action       action code forwarded to SNMPManager::sendAlarmReport
* @param sensorValue  reading that triggered the alarm; only used in the log text
*
* The warning is logged on every call. The alarm report itself is sent only when
* the same alarm is not already active for this module and item.
******************************************************************************************/
void ServerMonitor::sendAlarm(string alarmItem, ALARMS alarmID, int action, float sensorValue)
{
    ServerMonitor monitor;
    Oam oam;

    // Log pieces are collected first; the message is emitted at the very end
    LoggingID logId(SERVER_MONITOR_LOG_ID);
    MessageLog logger(logId);
    Message logMsg;
    Message::Args logArgs;
    logArgs.add(alarmItem);
    logArgs.add(", sensor value out-of-range: ");
    logArgs.add(sensorValue);

    // Name of the module we are running on; a placeholder is used when OAM
    // cannot supply it
    string moduleName;
    oamModuleInfo_t moduleInfo;
    try
    {
        moduleInfo = oam.getModuleInfo();
        moduleName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        moduleName = "Unknown Server";
    }

    // Give checkAlarm the chance to clear an active alarm above the reporting
    // threshold before a new one is raised
    monitor.checkAlarm(alarmItem, alarmID);

    // An alarm that is already active is not sent a second time
    const bool alreadyActive = oam.checkActiveAlarm(alarmID, moduleName, alarmItem);
    if (!alreadyActive)
    {
        SNMPManager alarmMgr;
        alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action);

        logArgs.add(", Alarm set: ");
        logArgs.add(alarmID);
    }

    // Emit the log entry
    logMsg.format(logArgs);
    logger.logWarningMessage(logMsg);
}
/******************************************************************************************
* @brief	checkDiskAlarm
*
* purpose:	Clear any active disk usage alarm that is more severe than the level
*           now being reported for the item.
*
* @param alarmItem  disk / file system the alarms are keyed on
* @param alarmID    level currently being reported:
*                     NO_ALARM        clears HIGH, MED and LOW
*                     DISK_USAGE_LOW  clears HIGH and MED
*                     DISK_USAGE_MED  clears HIGH
*                     anything else   clears nothing
******************************************************************************************/
void ServerMonitor::checkDiskAlarm(string alarmItem, ALARMS alarmID)
{
    Oam oam;
    // Not referenced below; still constructed so this function keeps creating
    // the same objects it always has.
    ServerMonitor serverMonitor;

    // Name of the module we are running on; a placeholder is used when OAM
    // cannot supply it
    string serverName;
    oamModuleInfo_t moduleInfo;
    try
    {
        moduleInfo = oam.getModuleInfo();
        serverName = boost::get<0>(moduleInfo);
    }
    catch (...)
    {
        serverName = "Unknown Server";
    }

    // Work out which severities are stale for the level being reported
    const bool dropLow  = (alarmID == NO_ALARM);
    const bool dropMed  = dropLow || (alarmID == DISK_USAGE_LOW);
    const bool dropHigh = dropMed || (alarmID == DISK_USAGE_MED);

    // Clear from the most severe downwards; only alarms that are actually
    // active get a clear
    if (dropHigh && oam.checkActiveAlarm(DISK_USAGE_HIGH, serverName, alarmItem))
        clearAlarm(alarmItem, DISK_USAGE_HIGH);

    if (dropMed && oam.checkActiveAlarm(DISK_USAGE_MED, serverName, alarmItem))
        clearAlarm(alarmItem, DISK_USAGE_MED);

    if (dropLow && oam.checkActiveAlarm(DISK_USAGE_LOW, serverName, alarmItem))
        clearAlarm(alarmItem, DISK_USAGE_LOW);
}
/****************************************************************************************** * @brief sendResourceAlarm * * purpose: send a trap and log the process information * ******************************************************************************************/ bool ServerMonitor::sendResourceAlarm(string alarmItem, ALARMS alarmID, int action, int usage) { ServerMonitor serverMonitor; Oam oam; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add(alarmItem); args.add(" usage at percentage of "); args.add(usage); // get current module name string moduleName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); moduleName = boost::get<0>(st); } catch (...) { moduleName = "Unknown Server"; } // check if there is an active alarm above the reporting theshold // that needs to be cleared if (alarmItem == "CPU") serverMonitor.checkCPUAlarm(alarmItem, alarmID); else if (alarmItem == "Local Disk" || alarmItem == "External") serverMonitor.checkDiskAlarm(alarmItem, alarmID); else if (alarmItem == "Local Memory") serverMonitor.checkMemoryAlarm(alarmItem, alarmID); else if (alarmItem == "Local Swap") serverMonitor.checkSwapAlarm(alarmItem, alarmID); // don't issue an alarm on thge dbroots is already issued by this or another server if ( alarmItem.find(startup::StartUp::installDir() + "/data") == 0 ) { // check if Alarm is already active from any module, don't resend if ( !( oam.checkActiveAlarm(alarmID, "*", alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } else { // check if Alarm is already active from this module, don't resend if ( !( oam.checkActiveAlarm(alarmID, moduleName, alarmItem)) ) { SNMPManager alarmMgr; // send alarm alarmMgr.sendAlarmReport(alarmItem.c_str(), alarmID, action); args.add(", Alarm set: "); 
args.add(alarmID); msg.format(args); ml.logInfoMessage(msg); return true; } else return false; } return true; }
/***************************************************************************************** * @brief diskMonitor Thread * * purpose: Get current Local and External disk usage and report alarms * *****************************************************************************************/ void diskMonitor() { ServerMonitor serverMonitor; Oam oam; SystemConfig systemConfig; ModuleTypeConfig moduleTypeConfig; typedef std::vector<std::string> LocalFileSystems; LocalFileSystems lfs; struct statvfs buf; // set defaults int localDiskCritical = 90, localDiskMajor = 80, localDiskMinor = 70, ExternalDiskCritical = 90, ExternalDiskMajor = 80, ExternalDiskMinor = 70; // get module types string moduleType; int moduleID=-1; string moduleName; oamModuleInfo_t t; try { t = oam.getModuleInfo(); moduleType = boost::get<1>(t); moduleID = boost::get<2>(t); moduleName = boost::get<0>(t); } catch (exception& e) {} bool Externalflag = false; //check for external disk DBrootList dbrootList; if (moduleType == "pm") { systemStorageInfo_t t; t = oam.getStorageConfig(); if ( boost::get<0>(t) == "external") Externalflag = true; // get dbroot list and storage type from config file DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbrootID = *pt; string dbroot = "DBRoot" + oam.itoa(dbrootID); string dbootdir; try{ oam.getSystemConfig(dbroot, dbootdir); } catch(...) {} if ( dbootdir.empty() || dbootdir == "" ) continue; DBrootData dbrootData; dbrootData.dbrootDir = dbootdir; dbrootData.downFlag = false; dbrootList.push_back(dbrootData); } } string cloud = oam::UnassignedName; try { oam.getSystemConfig( "Cloud", cloud); } catch(...) { cloud = oam::UnassignedName; } //get Gluster Config setting string GlusterConfig = "n"; try { oam.getSystemConfig( "GlusterConfig", GlusterConfig); } catch(...) 
{ GlusterConfig = "n"; } int diskSpaceCheck = 0; while(true) { SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { sleep(5); continue; } // Get Local/External Disk Mount points to monitor and associated thresholds try { oam.getSystemConfig (moduleTypeConfig); localDiskCritical = moduleTypeConfig.ModuleDiskCriticalThreshold; localDiskMajor = moduleTypeConfig.ModuleDiskMajorThreshold; localDiskMinor = moduleTypeConfig.ModuleDiskMinorThreshold; DiskMonitorFileSystems::iterator p = moduleTypeConfig.FileSystems.begin(); for( ; p != moduleTypeConfig.FileSystems.end() ; p++) { string fs = *p; lfs.push_back(fs); if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Config File System to monitor ="); args.add(fs); msg.format(args); ml.logDebugMessage(msg); } } } catch (...) { sleep(5); continue; } // get External info try { oam.getSystemConfig(systemConfig); } catch (...) { sleep(5); continue; } if (Externalflag) { // get External info try { ExternalDiskCritical = systemConfig.ExternalCriticalThreshold; ExternalDiskMajor = systemConfig.ExternalMajorThreshold; ExternalDiskMinor = systemConfig.ExternalMinorThreshold; } catch (...) 
{ sleep(5); continue; } } //check for local file systems LocalFileSystems::iterator p = lfs.begin(); while(p != lfs.end()) { string deviceName = *p; ++p; string fileName; // check local if ( deviceName == "/") { fileName = deviceName + "usr/local/Calpont/releasenum"; } else { fileName = deviceName + "/000.dir"; } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else continue; int64_t diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if ( diskSpaceCheck == 0 ) { if (diskUsage >= localDiskCritical && localDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, (int) diskUsage) ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMajor && localDiskMajor > 0 ) { if (serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Major Disk threshold with a percentage of "); args.add((int) 
diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= localDiskMinor && localDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, (int) diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Local Disk above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } //check for external file systems/devices if (Externalflag || (!Externalflag && GlusterConfig == "y" && moduleType == "pm") ){ try { DBRootConfigList dbrootConfigList; oam.getPmDbrootConfig(moduleID, dbrootConfigList); DBRootConfigList::iterator pt = dbrootConfigList.begin(); for( ; pt != dbrootConfigList.end() ; pt++) { int dbroot = *pt; string deviceName = systemConfig.DBRoot[dbroot-1]; string fileName = deviceName + "/000.dir"; if (DISK_DEBUG) { //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("DBRoots monitoring"); args.add(dbroot); args.add(" ,file system =" ); args.add(fileName); msg.format(args); ml.logDebugMessage(msg); } uint64_t totalBlocks; uint64_t usedBlocks; if (!statvfs(fileName.c_str(), &buf)) { uint64_t blksize, blocks, freeblks, free; blksize = buf.f_bsize; blocks = buf.f_blocks; freeblks = buf.f_bfree; totalBlocks = blocks * blksize; free = freeblks * blksize; usedBlocks = totalBlocks - free; } else { SMSystemDisk sd; sd.deviceName = deviceName; sd.usedPercent = 0; sd.totalBlocks = 0; sd.usedBlocks = 0; sdl.push_back(sd); continue; } int diskUsage = 0; if ( totalBlocks == 0 ) { diskUsage = 0; //Log this event LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Total Disk Usage is set to 0"); msg.format(args); ml.logWarningMessage(msg); } else diskUsage = (usedBlocks / (totalBlocks / 100)) + 1; SMSystemDisk sd; sd.deviceName = deviceName; 
sd.usedPercent = diskUsage; sd.totalBlocks = totalBlocks; sd.usedBlocks = usedBlocks; sdl.push_back(sd); if (DISK_DEBUG) cout << "Disk Usage for " << deviceName << " is " << diskUsage << endl; if (diskUsage >= ExternalDiskCritical && ExternalDiskCritical > 0 ) { //adjust if over 100% if ( diskUsage > 100 ) diskUsage = 100; if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_HIGH, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Critical Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMajor && ExternalDiskMajor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_MED, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Major Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else if (diskUsage >= ExternalDiskMinor && ExternalDiskMinor > 0 ) { if ( serverMonitor.sendResourceAlarm(deviceName, DISK_USAGE_LOW, SET, diskUsage)) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Disk usage for"); args.add(deviceName); args.add(" above Minor Disk threshold with a percentage of "); args.add((int) diskUsage); msg.format(args); ml.logInfoMessage(msg); } } else serverMonitor.checkDiskAlarm(deviceName); } } catch (exception& e) { cout << endl << "**** getPmDbrootConfig Failed : " << e.what() << endl; } } } //check OAM dbroot test flag to validate dbroot exist if on pm if ( moduleName.find("pm") != string::npos ) { //check OAM dbroot test flag to validate dbroot exist if ( dbrootList.size() != 0 ) { DBrootList::iterator p = dbrootList.begin(); while ( p != dbrootList.end() ) { //get dbroot directory 
string dbrootDir = (*p).dbrootDir; string dbrootName; string dbrootID; //get dbroot name string::size_type pos = dbrootDir.rfind("/",80); if (pos != string::npos) dbrootName = dbrootDir.substr(pos+1,80); //get ID dbrootID = dbrootName.substr(4,80); string fileName = dbrootDir + "/OAMdbrootCheck"; // retry in case we hit the remount window for ( int retry = 0 ; ; retry++ ) { bool fail = false; //first test, check if OAMdbrootCheck exists ifstream file (fileName.c_str()); if (!file) fail = true; else { //second test for amazon, check volume status if ( cloud != oam::UnassignedName ) { string volumeNameID = "PMVolumeName" + dbrootID; string volumeName = oam::UnassignedName; try { oam.getSystemConfig( volumeNameID, volumeName); } catch(...) {} if ( volumeName.empty() || volumeName == oam::UnassignedName ) fail = false; else { string status = oam.getEC2VolumeStatus(volumeName); if ( status == "attached" ) fail = false; else { fail = true; LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Volume not attached"); args.add(volumeName); args.add("/"); args.add(dbrootName); msg.format(args); ml.logCriticalMessage(msg); } } } else fail = false; } if (fail) { //double check system status before reporting any error BUG 5078 SystemStatus systemstatus; try { oam.getSystemStatus(systemstatus); } catch (exception& ex) {} if (systemstatus.SystemOpState != oam::ACTIVE ) { break; } if ( retry < 10 ) { sleep(3); continue; } else { if ( !(*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Lost access to "); args.add(dbrootDir); msg.format(args); ml.logCriticalMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_DOWN, moduleName); (*p).downFlag = true; try{ oam.setDbrootStatus(dbrootID, oam::AUTO_OFFLINE); } catch (exception& ex) {} break; } } } else { if ( (*p).downFlag ) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog 
ml(lid); Message msg; Message::Args args; args.add("dbroot monitoring: Access back to "); args.add(dbrootDir); msg.format(args); ml.logInfoMessage(msg); oam.sendDeviceNotification(dbrootName, DBROOT_UP, moduleName); (*p).downFlag = false; try{ oam.setDbrootStatus(dbrootID, oam::ACTIVE); } catch (exception& ex) {} } file.close(); break; } } p++; } } } //do Gluster status check, if configured if ( GlusterConfig == "y") { bool pass = true; string errmsg = "unknown"; try { string arg1 = ""; string arg2 = ""; int ret = oam.glusterctl(oam::GLUSTER_STATUS, arg1, arg2, errmsg); if ( ret != 0 ) { cerr << "FAILURE: Status check error: " + errmsg << endl; pass = false; } } catch (exception& e) { cerr << endl << "**** glusterctl API exception: " << e.what() << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } catch (...) { cerr << endl << "**** glusterctl API exception: UNKNOWN" << endl; cerr << "FAILURE: Status check error" << endl; pass = false; } if ( !pass ) { // issue log and alarm LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Gluster Status check failure error msg: "); args.add(errmsg); msg.format(args); ml.logWarningMessage(msg); serverMonitor.sendResourceAlarm(errmsg, GLUSTER_DISK_FAILURE, SET, 0); } } // sleep 10 seconds sleep(MONITOR_PERIOD/6); //check disk space every 10 minutes diskSpaceCheck++; if ( diskSpaceCheck >= 60 ) diskSpaceCheck = 0; lfs.clear(); sdl.clear(); } // end of while loop }
/***************************************************************************************** * @brief setSNMPModuleName API * * purpose: Set SNMP Module name in the snmpdx.conf file * *****************************************************************************************/ void SNMPManager::setSNMPModuleName () { // get current Module name Oam oam; string ModuleName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); ModuleName = boost::get<0>(st); } catch (...) { ModuleName = "Unknown Report Module"; } string agentName = SUB_AGENT; string fileName; makeFileName (agentName, fileName); vector <string> lines; ifstream oldFile (fileName.c_str()); if (!oldFile) throw runtime_error ("No configuration file found"); char line[200]; string buf; string newLine; string newLine1; string delimiters = " "; while (oldFile.getline(line, 200)) { buf = line; string::size_type pos = buf.find("ModuleNameStub",0); if (pos != string::npos) { newLine = buf.substr(0, pos); newLine.append(ModuleName); string::size_type pos1 = buf.find("|",pos); if (pos1 != string::npos) { newLine1 = buf.substr(pos1, 200); newLine.append(newLine1); } buf = newLine; } // output to temp file lines.push_back(buf); } oldFile.close(); unlink (fileName.c_str()); ofstream newFile (fileName.c_str()); // create new file int fd = open(fileName.c_str(), O_RDWR|O_CREAT, 0666); // Aquire an exclusive lock if (flock(fd,LOCK_EX) == -1) { throw runtime_error ("Lock SNMP configuration file error"); } copy(lines.begin(), lines.end(), ostream_iterator<string>(newFile, "\n")); newFile.close(); // Release lock if (flock(fd,LOCK_UN) == -1) { throw runtime_error ("Release lock SNMP configuration file error"); } close(fd); }
int main(int argc, char** argv) { int c; string pname(argv[0]); bool vflg = false; bool dflg = false; bool xflg = false; string configFile; opterr = 0; while ((c = getopt(argc, argv, "c:vdxh")) != EOF) switch (c) { case 'v': vflg = true; break; case 'd': dflg = true; break; case 'c': configFile = optarg; break; case 'x': xflg = true; break; case 'h': case '?': default: usage(pname); return (c == 'h' ? 0 : 1); break; } if ((argc - optind) < 3) { usage(pname); return 1; } #ifdef COMMUNITY_KEYRANGE //No OAM in CE... dflg = true; #endif Oam oam; oamModuleInfo_t t; bool parentOAMModuleFlag = true; string parentOAMModule = " "; int serverInstallType = oam::INSTALL_COMBINE_DM_UM_PM; //get local module info; validate running on Active Parent OAM Module try { t = oam.getModuleInfo(); parentOAMModuleFlag = boost::get<4>(t); parentOAMModule = boost::get<3>(t); serverInstallType = boost::get<5>(t); } catch (exception&) { parentOAMModuleFlag = true; } if (!dflg && !parentOAMModuleFlag) { cerr << "Exiting, setConfig can only be run on the Active " "OAM Parent Module '" << parentOAMModule << "'" << endl; return 2; } Config* cf; if (configFile.length() > 0) cf = Config::makeConfig(configFile); else cf = Config::makeConfig(); if (vflg) cout << "Using config file: " << cf->configFile() << endl; if (xflg) cf->delConfig(argv[optind + 0], argv[optind + 1]); else cf->setConfig(argv[optind + 0], argv[optind + 1], argv[optind + 2]); cf->write(); if (dflg || serverInstallType == oam::INSTALL_COMBINE_DM_UM_PM) return 0; //get number of pms string count = cf->getConfig("PrimitiveServers", "Count"); try { oam.distributeConfigFile(); //sleep to give time for change to be distributed sleep(atoi(count.c_str())); } catch (...) { return 1; } return 0; }
int main(int argc, char *argv[]) { Oam oam; string installDir(startup::StartUp::installDir()); Config* sysConfig = Config::makeConfig(); string SystemSection = "SystemConfig"; string InstallSection = "Installation"; bool HARDWARE = false; bool SOFTWARE = false; bool CONFIG = false; bool DBMS = false; bool RESOURCE = false; bool LOG = false; bool BULKLOG = false; bool HADOOP = false; //get current time and date time_t now; now = time(NULL); struct tm tm; localtime_r(&now, &tm); char timestamp[200]; strftime (timestamp, 200, "%m:%d:%y-%H:%M:%S", &tm); currentDate = timestamp; char helpArg[3] = "-h"; // Get System Name try{ oam.getSystemConfig("SystemName", systemName); } catch(...) { systemName = "unassigned"; } // get Local Module Name and Server Install Indicator string singleServerInstall; oamModuleInfo_t st; try { st = oam.getModuleInfo(); localModule = boost::get<0>(st); } catch (...) { cout << endl << "**** Failed : Failed to read Local Module Name" << endl; exit(-1); } try{ oam.getSystemConfig("SingleServerInstall", singleServerInstall); } catch(...) { singleServerInstall = "y"; } if (argc == 1) { argv[1] = &helpArg[0]; argc = 2; } string DataFilePlugin; try{ DataFilePlugin = sysConfig->getConfig(SystemSection, "DataFilePlugin"); } catch(...) { cout << "ERROR: Problem accessing InfiniDB configuration file" << endl; exit(-1); } for( int i = 1; i < argc; i++ ) { if( string("-h") == argv[i] ) { cout << endl; cout << "'calpontSupport' generates a Set of System Support Report Files in a tar file" << endl; cout << "called calpontSupportReport.'system-name'.tar.gz in the local directory." << endl; cout << "It should be run on the server with the DBRM front-end." << endl; cout << "Check the Admin Guide for additional information." 
<< endl; cout << endl; cout << "Usage: calpontSupport [-h][-a][-hw][-s][-c][-db][-r][-l][-bl][-lc][-p 'root-password'][-mp 'mysql-root-password'][-de]"; // if hdfs set up print the hadoop option if (!DataFilePlugin.empty()) cout << "[-hd]"; cout << endl; cout << " -h help" << endl; cout << " -a Output all Reports (excluding Bulk Logs Reports)" << endl; cout << " -hw Output Hardware Reports only" << endl; cout << " -s Output Software Reports only" << endl; cout << " -c Output Configuration/Status Reports only" << endl; cout << " -db Output DBMS Reports only" << endl; cout << " -r Output Resource Reports only" << endl; cout << " -l Output Calpont Log/Alarms Reports only" << endl; cout << " -bl Output Calpont Bulk Log Reports only" << endl; cout << " -lc Output Reports for Local Server only" << endl; cout << " -p password (multi-server systems), root-password or 'ssh' to use 'ssh keys'" << endl; cout << " -mp mysql root user password" << endl; cout << " -de Debug Flag" << endl; // if hdfs set up print the hadoop option if (!DataFilePlugin.empty()) cout << " -hd Output hadoop reports only" << endl; exit (0); } else { if( string("-a") == argv[i] ) { HARDWARE = true; SOFTWARE = true; CONFIG = true; DBMS = true; RESOURCE = true; LOG = true; HADOOP = (DataFilePlugin.empty()? 
false : true); } else if( string("-hw") == argv[i] ) HARDWARE = true; else if( string("-s") == argv[i] ) SOFTWARE = true; else if( string("-c") == argv[i] ) CONFIG = true; else if( string("-db") == argv[i] ) DBMS = true; else if( string("-r") == argv[i] ) RESOURCE = true; else if( string("-l") == argv[i] ) LOG = true; else if( string("-bl") == argv[i] ) BULKLOG = true; else if( string("-lc") == argv[i] ) LOCAL = true; else if( string("-p") == argv[i] ) { i++; if ( argc == i ) { cout << "ERROR: missing root password argument" << endl; exit(-1); } rootPassword = argv[i]; //add single quote for special characters if ( rootPassword != "ssh" ) { rootPassword = "******" + rootPassword + "'"; } } else if( string("-mp") == argv[i] ) { i++; if ( argc == i ) { cout << "ERROR: missing mysql root user password argument" << endl; exit(-1); } mysqlpw = argv[i]; mysqlpw = "'" + mysqlpw + "'"; } else if( string("-de") == argv[i] ) debug_flag = "1"; else if ( string("-hd") == argv[i] ) { HADOOP = (DataFilePlugin.empty()? false : true); } else { cout << "Invalid Option of '" << argv[i] << "', run with '-h' for help" << endl; exit (1); } } } //default to -a if nothing is set if ( !HARDWARE && !SOFTWARE && !CONFIG && !DBMS && !RESOURCE && !LOG && !BULKLOG && !HADOOP) { HARDWARE = true; SOFTWARE = true; CONFIG = true; DBMS = true; RESOURCE = true; LOG = true; HADOOP = (DataFilePlugin.empty()? false : true); } //get Parent OAM Module Name and setup of it's Custom OS files string PrimaryUMModuleName; try{ PrimaryUMModuleName = sysConfig->getConfig(SystemSection, "PrimaryUMModuleName"); } catch(...) { cout << "ERROR: Problem getting Parent OAM Module Name" << endl; exit(-1); } if ( PrimaryUMModuleName == "unassigned" ) PrimaryUMModuleName = localModule; if ( (localModule != PrimaryUMModuleName) && DBMS ) { char* pcommand = 0; char *p; string argument = "n"; while(true) { cout << endl << "You selected to get the DBMS data." 
<< endl; cout << "You need to run the calpontSupport command on module '" << PrimaryUMModuleName << "' to get that information." << endl; cout << "Or you can proceed on to get all data except the DBMS." << endl; pcommand = readline(" Do you want to proceed: (y or n) [n]: "); if (pcommand && *pcommand) { p = strtok(pcommand," "); argument = p; free(pcommand); pcommand = 0; } if (pcommand) { free(pcommand); pcommand = 0; } if( argument == "y") { cout << endl; break; } else if( argument == "n") exit (1); } } //get number of worker-nodes, will tell us if a single server system //get Parent OAM Module Name and setup of it's Custom OS files try{ string NumWorkers = sysConfig->getConfig("DBRM_Controller", "NumWorkers"); if ( NumWorkers == "1" ) singleServerInstall = "y"; } catch(...) {} if ( singleServerInstall == "n" && !LOCAL) if ( HARDWARE || SOFTWARE || CONFIG || RESOURCE || LOG || HADOOP ) if ( rootPassword.empty() ) { cout << "ERROR: Multi-Module System, Password Argument required or use '-lc' option, check help for more information" << endl; exit(-1); } //get Parent OAM Module Name and setup of it's Custom OS files //string parentOAMModuleName; ChildModule parentOAMModule; try{ parentOAMModule.moduleName = sysConfig->getConfig(SystemSection, "ParentOAMModuleName"); } catch(...) { cout << "ERROR: Problem getting Parent OAM Module Name" << endl; exit(-1); } //Get list of configured system modules SystemModuleTypeConfig sysModuleTypeConfig; try{ oam.getSystemConfig(sysModuleTypeConfig); } catch(...) 
{ cout << "ERROR: Problem reading the Calpont System Configuration file" << endl; exit(-1); } string ModuleSection = "SystemModuleConfig"; for ( unsigned int i = 0 ; i < sysModuleTypeConfig.moduletypeconfig.size(); i++) { string moduleType = sysModuleTypeConfig.moduletypeconfig[i].ModuleType; int moduleCount = sysModuleTypeConfig.moduletypeconfig[i].ModuleCount; if ( moduleCount == 0 ) //no modules equipped for this Module Type, skip continue; //get IP addresses and Host Names DeviceNetworkList::iterator listPT = sysModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.begin(); for( ; listPT != sysModuleTypeConfig.moduletypeconfig[i].ModuleNetworkList.end() ; listPT++) { string moduleName = (*listPT).DeviceName; HostConfigList::iterator pt1 = (*listPT).hostConfigList.begin(); string moduleIPAddr = (*pt1).IPAddr; string moduleHostName = (*pt1).HostName; if ( moduleName == localModule) { localModuleHostName = moduleHostName; } //save Child modules if ( moduleName != localModule && moduleType != "xm") { childmodule.moduleName = moduleName; childmodule.moduleIP = moduleIPAddr; childmodule.hostName = moduleHostName; childmodulelist.push_back(childmodule); } if (moduleName == parentOAMModule.moduleName) { parentOAMModule.moduleIP = moduleIPAddr; parentOAMModule.hostName = moduleHostName; parentOAMModule.moduleName = moduleName; } } } //end of i for loop // create a clean Calpont Support Report system("rm -f *_configReport.txt"); system("rm -f *_dbmsReport.txt"); system("rm -f *_hardwareReport.txt"); system("rm -f *_logReport.txt"); system("rm -f *_bulklogReport.txt"); system("rm -f *_resourceReport.txt"); system("rm -f *_softwareReport.txt"); system("rm -f hadoopReport.txt"); // // Software // if ( SOFTWARE ) { string reportType = "software"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + 
oam.itoa(status); } sleep(5); } // // Configuration // if ( CONFIG ) { string reportType = "config"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + oam.itoa(status); } sleep(5); } // // Alarms and Calpont Logs // if ( LOG ) { string reportType = "log"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + oam.itoa(status); } sleep(5); } // // Bulk Logs // if ( BULKLOG ) { string reportType = "bulklog"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + oam.itoa(status); } sleep(5); } // // Hardware // if ( HARDWARE ) { string reportType = "hardware"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + oam.itoa(status); } sleep(5); } // // Resources // if ( RESOURCE ) { string reportType = "resource"; pthread_t reportthread; int status = pthread_create (&reportthread, NULL, (void*(*)(void*)) &reportThread, &reportType); if ( status != 0 ) { cout << "ERROR: reportthread: pthread_create failed, return status = " + oam.itoa(status); } sleep(5); } // // DBMS // if ( DBMS ) { system("rm -f calpontSupportReport.txt;touch calpontSupportReport.txt"); title(); system("echo '=======================================================================' >> calpontSupportReport.txt"); system("echo '= DBMS Report =' >> calpontSupportReport.txt"); system("echo '=======================================================================' >> 
calpontSupportReport.txt"); // run DBMS report on local server cout << "Get dbms report data for " << localModule << endl; bool FAILED = false; if ( localModule != PrimaryUMModuleName ) { cout << " FAILED: run calpontSupport on '" << PrimaryUMModuleName << "' to get the dbrm report" << endl; FAILED = true; } else { // check if mysql is supported and get info string calpontMysql = installDir + "/mysql/bin/mysql --defaults-file=" + installDir + "/mysql/my.cnf -u root "; string cmd = calpontMysql + " -e 'status' > /tmp/idbmysql.log 2>&1"; system(cmd.c_str()); //check for mysql password set string pwprompt = " "; if (oam.checkLogStatus("/tmp/idbmysql.log", "ERROR 1045") ) { cout << "NOTE: MySQL root user password is set" << endl; //needs a password, was password entered on command line if ( mysqlpw == " " ) { //go check my.cnf string file = installDir + "/mysql/my.cnf"; ifstream oldFile (file.c_str()); vector <string> lines; char line[200]; string buf; while (oldFile.getline(line, 200)) { buf = line; string::size_type pos = buf.find("password",0); if (pos != string::npos) { string::size_type pos1 = buf.find("=",0); if (pos1 != string::npos) { pos = buf.find("#",0); if (pos == string::npos) { //password arg in my.cnf, go get password cout << "NOTE: Using password from my.cnf" << endl; mysqlpw = buf.substr(pos1+1,80); cout << mysqlpw << endl; break; } } } } oldFile.close(); if ( mysqlpw == " " ) { cout << "NOTE: No password provide on command line or found uncommented in my.cnf" << endl; cout << endl; string prompt = " *** Enter MySQL password > "; mysqlpw = getpass(prompt.c_str()); } } //check for mysql password set pwprompt = "--password="******" -e 'status' > /tmp/idbmysql.log 2>&1"; system(cmd.c_str()); if (oam.checkLogStatus("/tmp/idbmysql.log", "ERROR 1045") ) { cout << "FAILED: Failed login using MySQL root user password '" << mysqlpw << "'" << endl; FAILED = true; } } if (!FAILED) { // check if mysql is supported and get info string calpontMysql = installDir + 
"/mysql/bin/mysql --defaults-file=" + installDir + "/mysql/my.cnf -u root " + pwprompt; string cmd = calpontMysql + " -V > /dev/null 2>&1"; int ret = system(cmd.c_str()); if ( WEXITSTATUS(ret) == 0) { // run DBMS report info system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS InfiniDB Mysql Version ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " -e status ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'status' >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql Calpont System Column ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " -e desc calpontsys.syscolumn ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'desc calpontsys.syscolumn;' >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql Calpont System Table ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " -e desc calpontsys.systable ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'desc calpontsys.systable;' >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql Calpont System Catalog Data ********************' >> calpontSupportReport.txt"); system("echo ' ' >> 
calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " calpontsys < " + installDir + "/mysql/dumpcat_mysql.sql ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " calpontsys < " + installDir + "/mysql/dumpcat_mysql.sql >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql Calpont System Table Data ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " -e select * from calpontsys.systable ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'select * from calpontsys.systable;' >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql Calpont Usernames ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " -e show databases ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'show databases;' >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql InfiniDB variables ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# " + calpontMysql + " show variables ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = calpontMysql + " -e 'show variables;' >> calpontSupportReport.txt"; system(cmd.c_str()); } } 
} system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** Database Size Report ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); string cmd = "echo '################# /bin/databaseSizeReport ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = installDir + "/bin/databaseSizeReport >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** DBMS Mysql InfiniDB config file ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# cat /mysql/my.cnf ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "cat " + installDir + "/mysql/my.cnf 2>/dev/null >> calpontSupportReport.txt"; system(cmd.c_str()); system("echo ' ' >> calpontSupportReport.txt"); system("echo '******************** Active Queries ********************' >> calpontSupportReport.txt"); system("echo ' ' >> calpontSupportReport.txt"); cmd = "echo '################# mcsadmin getActiveSqlStatement ################# ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "echo ' ' >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = installDir + "/bin/mcsadmin getActiveSqlStatement >> calpontSupportReport.txt"; system(cmd.c_str()); cmd = "cat calpontSupportReport.txt > " + localModule + "_dbmsReport.txt"; system(cmd.c_str()); } // // HADOOP // if (HADOOP) { if (LOCAL || childmodulelist.empty()) { cout << "Get hadoop report data" << endl; string cmd = installDir + "/bin/hadoopReport.sh " + localModule + " " + installDir + "\n"; cmd += " mv -f /tmp/hadoopReport.txt ."; FILE* pipe = popen(cmd.c_str(), "r"); if (!pipe) { cout << "Failed to get a pipe for hadoop health check commands" << endl; exit(-1); } 
pclose(pipe); } else { // only get hadoop report from parentOAMModule, because it's consistant view. parentmodulelist.push_back(parentOAMModule); threadInfo_t *st = new threadInfo_t; ChildModuleList::iterator iter = parentmodulelist.begin(); *st = boost::make_tuple(iter, "hadoop"); pthread_t hdthread; int status = pthread_create (&hdthread, NULL, (void*(*)(void*)) &childReportThread, st); if ( status != 0 ) { cout << "ERROR: childreportthread: pthread_create failed, return status = " + oam.itoa(status) << endl; } } } //wait for all threads to complete sleep(5); int wait = 0; while (true) { //cout << "check " << runningThreads << endl; if (runningThreads < 1) break; sleep(2); wait++; // give it 60 minutes to complete if ( wait >= 3600 * 5) { cout << "Timed out (60 minutes) waiting for Requests to complete" << endl; } } system("rm -f calpontSupportReport.txt"); system("unix2dos *Report.txt > /dev/null 2>&1"); system("rm -rf calpontSupportReport;mkdir calpontSupportReport;mv *Report.txt calpontSupportReport/. > /dev/null 2>&1;mv *Report.tar.gz calpontSupportReport/. > /dev/null 2>&1"); string cmd = "tar -zcf calpontSupportReport." + systemName + ".tar.gz calpontSupportReport/*"; system(cmd.c_str()); cout << endl << "Calpont Support Script Successfully completed, files located in calpontSupportReport." + systemName + ".tar.gz" << endl; }
void procmonMonitor() { ServerMonitor serverMonitor; Oam oam; //wait before monitoring is started sleep(60); // get current server name string moduleName; oamModuleInfo_t st; try { st = oam.getModuleInfo(); moduleName = boost::get<0>(st); } catch (...) { // Critical error, Log this event and exit LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("Failed to read local module Info"); msg.format(args); ml.logCriticalMessage(msg); exit(-1); } string msgPort = moduleName + "_ProcessMonitor"; int heartbeatCount = 0; // loop forever monitoring Local Process Monitor while(true) { ByteStream msg; ByteStream::byte requestID = LOCALHEARTBEAT; msg << requestID; try { MessageQueueClient mqRequest(msgPort); mqRequest.write(msg); // wait 10 seconds for response ByteStream::byte returnACK; ByteStream::byte returnRequestID; ByteStream::byte requestStatus; ByteStream receivedMSG; struct timespec ts = { 10, 0 }; try { receivedMSG = mqRequest.read(&ts); if (receivedMSG.length() > 0) { receivedMSG >> returnACK; receivedMSG >> returnRequestID; receivedMSG >> requestStatus; if ( returnACK == oam::ACK && returnRequestID == requestID) { // ACK for this request heartbeatCount = 0; } } else { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("procmonMonitor: ProcMon Msg timeout!!!"); msg.format(args); ml.logWarningMessage(msg); heartbeatCount++; if ( heartbeatCount > 2 ) { //Process Monitor not responding, restart it system("pkill ProcMon"); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("procmonMonitor: Restarting ProcMon"); msg.format(args); ml.logWarningMessage(msg); sleep(60); heartbeatCount = 0; } } mqRequest.shutdown(); } catch (SocketClosed &ex) { string error = ex.what(); LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("procmonMonitor: EXCEPTION ERROR on mqRequest.read: " + error); 
msg.format(args); ml.logErrorMessage(msg); } catch (...) { LoggingID lid(SERVER_MONITOR_LOG_ID); MessageLog ml(lid); Message msg; Message::Args args; args.add("procmonMonitor: EXCEPTION ERROR on mqRequest.read: Caught unknown exception"); msg.format(args); ml.logErrorMessage(msg); } } catch (exception& ex)