// Picks the best-scoring language matcher for `buffer`, optionally restricted
// to matchers whose Encoding equals `encoding`.
//
//   encoding          - when non-empty, only matchers with this exact Encoding
//                       are considered; when empty, every matcher is tried.
//   buffer, length    - the raw bytes to analyse.
//   matchingCriterion - threshold: a matcher is accepted only if its criterion
//                       is strictly greater than this value (and than every
//                       previously accepted matcher's criterion).
//
// Returns the info() of the winning matcher, or a default-constructed
// shared_ptr when no matcher beats the threshold.
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) {
	shared_ptr<LanguageInfo> bestInfo;
	// Buffer statistics are generated lazily, once per distinct
	// char-sequence length requested by the matchers.
	std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsByLength;

	const bool acceptAnyEncoding = encoding.empty();
	int bestCriterion = matchingCriterion;

	for (SBVector::const_iterator matcher = myMatchers.begin(); matcher != myMatchers.end(); ++matcher) {
		if (acceptAnyEncoding || (*matcher)->info()->Encoding == encoding) {
			const int sequenceLength = (*matcher)->charSequenceLength();

			// operator[] creates an empty slot on first use; fill it in place.
			shared_ptr<ZLMapBasedStatistics> &statistics = statisticsByLength[sequenceLength];
			if (statistics.isNull()) {
				statistics = new ZLMapBasedStatistics();
				ZLStatisticsGenerator("\r\n ").generate(buffer, length, sequenceLength, *statistics);
			}

			const int score = (*matcher)->criterion(*statistics);
			if (score > bestCriterion) {
				bestCriterion = score;
				bestInfo = (*matcher)->info();
			}
		}
	}

	return bestInfo;
}
// Picks the best-scoring language matcher for `buffer`.
//
// If the buffer starts with a UTF-16 byte-order mark, only matchers whose
// Encoding equals the corresponding name ("UTF-16BE" for FE FF, "UTF-16" for
// FF FE) are considered; otherwise every matcher is tried.
//
//   buffer, length    - the raw bytes to analyse.
//   matchingCriterion - threshold: a matcher is accepted only if its criterion
//                       is strictly greater than this value (and than every
//                       previously accepted matcher's criterion).
//
// Returns the info() of the winning matcher, or a default-constructed
// shared_ptr when no matcher beats the threshold.
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
	shared_ptr<LanguageInfo> info;
	// Buffer statistics are generated lazily, once per distinct
	// char-sequence length requested by the matchers.
	std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;

	std::string ucs2;
	// The byte-order mark occupies two bytes; never read past a shorter buffer.
	if (length >= 2) {
		const unsigned char first = static_cast<unsigned char>(buffer[0]);
		const unsigned char second = static_cast<unsigned char>(buffer[1]);
		if (first == 0xFE && second == 0xFF) {
			ucs2 = "UTF-16BE";
		} else if (first == 0xFF && second == 0xFE) {
			ucs2 = "UTF-16";
		}
	}

	for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
		if (!ucs2.empty() && (*it)->info()->Encoding != ucs2) {
			continue;
		}

		const int charSequenceLength = (*it)->charSequenceLength();
		// operator[] creates an empty slot on first use; fill it in place so
		// the map is looked up only once per matcher.
		shared_ptr<ZLMapBasedStatistics> &stat = statisticsMap[charSequenceLength];
		if (stat.isNull()) {
			stat = new ZLMapBasedStatistics();
			ZLStatisticsGenerator("\r\n ").generate(buffer, length, charSequenceLength, *stat);
		}

		const int criterion = (*it)->criterion(*stat);
		if (criterion > matchingCriterion) {
			info = (*it)->info();
			matchingCriterion = criterion;
		}
	}

	return info;
}
// Picks the best-scoring language matcher for `buffer`, using statistics over
// 3-character sequences of the buffer.
//
// If the buffer starts with a UTF-16 byte-order mark, only matchers whose
// Encoding equals the corresponding name ("UTF-16BE" for FE FF, "UTF-16" for
// FF FE) are considered; otherwise every matcher is tried.
//
//   buffer, length    - the raw bytes to analyse.
//   matchingCriterion - threshold: a matcher is accepted only if its criterion
//                       is strictly greater than this value (and than every
//                       previously accepted matcher's criterion).
//
// Returns the info() of the winning matcher, or a default-constructed
// shared_ptr when no matcher beats the threshold.
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
	shared_ptr<LanguageInfo> info;

	// The statistics are computed once and shared by all matchers.
	const int charSequenceLength = 3;
	ZLMapBasedStatistics bufferStatistics;
	ZLStatisticsGenerator("\r\n ").generate(buffer, length, charSequenceLength, bufferStatistics);

	std::string ucs2;
	// The byte-order mark occupies two bytes; never read past a shorter buffer.
	if (length >= 2) {
		const unsigned char first = static_cast<unsigned char>(buffer[0]);
		const unsigned char second = static_cast<unsigned char>(buffer[1]);
		if (first == 0xFE && second == 0xFF) {
			ucs2 = "UTF-16BE";
		} else if (first == 0xFF && second == 0xFE) {
			ucs2 = "UTF-16";
		}
	}

	for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
		if (!ucs2.empty() && (*it)->info()->Encoding != ucs2) {
			continue;
		}

		const int criterion = (*it)->criterion(bufferStatistics);
		if (criterion > matchingCriterion) {
			info = (*it)->info();
			matchingCriterion = criterion;
		}
	}

	return info;
}
int main(int argc, char **argv) { if (argc != 2) { std::cerr << "usage:\n " << argv[0] << " <directory name with language examples in txt format>\n"; return 0; } ZLibrary::init(argc, argv); std::string directoryName = argv[1]; shared_ptr<ZLDir> directory = ZLFile(directoryName).directory(false); if (directory.isNull()) { std::cerr << "cannot open directory " << directoryName << "\n"; return 0; } std::vector<std::string> fileNames; directory->collectFiles(fileNames, false); ZLMapBasedStatistics tempStatistics, resultStatistics; int counter = 0; for (std::vector<std::string>::const_iterator it = fileNames.begin(); it != fileNames.end(); ++it) { ZLFile file(directory->itemPath(*it)); if (file.extension() != "txt") { continue; } else { ++counter; } ZLStatisticsGenerator("\n\r ").generate(file.path(), CHAR_SEQUENCE_SIZE, tempStatistics); if (counter == 1) { resultStatistics = tempStatistics; continue; } resultStatistics.retain(tempStatistics); } std::string outName (argv[1]); outName += "pattern.stat"; shared_ptr<ZLOutputStream> streamOut = ZLFile(outName).outputStream(); streamOut->open(); ZLMapBasedStatistics restrictedResultStatistics = resultStatistics.top(TOP_AMOUNT); restrictedResultStatistics.scaleToShort(); ZLStatisticsXMLWriter(*streamOut).writeStatistics(restrictedResultStatistics); streamOut->close(); ZLibrary::shutdown(); return 0; }