Example #1
0
/**
 * Picks the best-scoring language matcher for a buffer, optionally restricted
 * to a single encoding.
 *
 * @param encoding          when non-empty, only matchers whose info()->Encoding
 *                          equals this string are considered; an empty string
 *                          lets every matcher take part
 * @param buffer            data the statistics are generated from
 * @param length            length value handed to the statistics generator
 *                          together with buffer
 * @param matchingCriterion score a matcher has to exceed (strictly) in order
 *                          to be selected
 * @return info of the matcher with the highest score above matchingCriterion
 *         (the earliest one wins on equal scores); a default-constructed
 *         pointer when no matcher exceeds the threshold
 */
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfoForEncoding(const std::string &encoding, const char *buffer, size_t length, int matchingCriterion) {
	typedef std::map<int,shared_ptr<ZLMapBasedStatistics> > StatisticsCache;

	// Statistics are generated once per distinct sequence length and reused
	// by every matcher that asks for the same length.
	StatisticsCache cache;
	shared_ptr<LanguageInfo> bestInfo;
	const bool acceptAnyEncoding = encoding.empty();

	for (SBVector::const_iterator matcher = myMatchers.begin(); matcher != myMatchers.end(); ++matcher) {
		if (acceptAnyEncoding || (*matcher)->info()->Encoding == encoding) {
			const int sequenceLength = (*matcher)->charSequenceLength();

			shared_ptr<ZLMapBasedStatistics> statistics = cache[sequenceLength];
			if (statistics.isNull()) {
				statistics = new ZLMapBasedStatistics();
				ZLStatisticsGenerator("\r\n ").generate(buffer, length, sequenceLength, *statistics);
				cache[sequenceLength] = statistics;
			}

			const int score = (*matcher)->criterion(*statistics);
			if (score > matchingCriterion) {
				// Raise the bar so that later matchers must beat this one.
				matchingCriterion = score;
				bestInfo = (*matcher)->info();
			}
		}
	}

	return bestInfo;
}
/**
 * Picks the best-scoring language matcher for a buffer.
 *
 * If the buffer starts with the two bytes FE FF, only matchers whose
 * info()->Encoding is "UTF-16BE" are considered; if it starts with FF FE,
 * only matchers whose encoding is "UTF-16" are considered. Otherwise every
 * matcher takes part.
 *
 * @param buffer            data the statistics are generated from
 * @param length            number of bytes available in buffer; the leading
 *                          two bytes are inspected only when length >= 2
 * @param matchingCriterion score a matcher has to exceed (strictly) in order
 *                          to be selected
 * @return info of the matcher with the highest score above matchingCriterion
 *         (the earliest one wins on equal scores); a default-constructed
 *         pointer when no matcher exceeds the threshold
 */
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
	shared_ptr<LanguageInfo> info;
	// Statistics are generated once per distinct sequence length and shared
	// between all matchers that use the same length.
	std::map<int,shared_ptr<ZLMapBasedStatistics> > statisticsMap;

	std::string ucs2;
	// Guard the byte-order-mark probe: reading buffer[0]/buffer[1] of a buffer
	// shorter than two bytes would be an out-of-bounds access.
	if (length >= 2) {
		const unsigned char first = static_cast<unsigned char>(buffer[0]);
		const unsigned char second = static_cast<unsigned char>(buffer[1]);
		if (first == 0xFE && second == 0xFF) {
			ucs2 = "UTF-16BE";
		} else if (first == 0xFF && second == 0xFE) {
			ucs2 = "UTF-16";
		}
	}

	for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
		if (!ucs2.empty() && (*it)->info()->Encoding != ucs2) {
			continue;
		}

		const int charSequenceLength = (*it)->charSequenceLength();
		// Bind to the map slot so that filling it needs no second lookup.
		shared_ptr<ZLMapBasedStatistics> &stat = statisticsMap[charSequenceLength];
		if (stat.isNull()) {
			stat = new ZLMapBasedStatistics();
			ZLStatisticsGenerator("\r\n ").generate(
				buffer, length, charSequenceLength, *stat
			);
		}

		const int criterion = (*it)->criterion(*stat);
		if (criterion > matchingCriterion) {
			info = (*it)->info();
			// Raise the bar so that later matchers must beat this one.
			matchingCriterion = criterion;
		}
	}
	return info;
}
/**
 * Picks the best-scoring language matcher for a buffer, using one set of
 * statistics generated with the fixed value 3 passed to the generator.
 *
 * If the buffer starts with the two bytes FE FF, only matchers whose
 * info()->Encoding is "UTF-16BE" are considered; if it starts with FF FE,
 * only matchers whose encoding is "UTF-16" are considered. Otherwise every
 * matcher takes part.
 *
 * @param buffer            data the statistics are generated from
 * @param length            number of bytes available in buffer; the leading
 *                          two bytes are inspected only when length >= 2
 * @param matchingCriterion score a matcher has to exceed (strictly) in order
 *                          to be selected
 * @return info of the matcher with the highest score above matchingCriterion
 *         (the earliest one wins on equal scores); a default-constructed
 *         pointer when no matcher exceeds the threshold
 */
shared_ptr<ZLLanguageDetector::LanguageInfo> ZLLanguageDetector::findInfo(const char *buffer, size_t length, int matchingCriterion) {
	shared_ptr<LanguageInfo> info;

	// The same statistics object is handed to every matcher.
	ZLMapBasedStatistics bufferStatistics;
	ZLStatisticsGenerator("\r\n ").generate(buffer, length, 3, bufferStatistics);

	std::string ucs2;
	// Guard the byte-order-mark probe: reading buffer[0]/buffer[1] of a buffer
	// shorter than two bytes would be an out-of-bounds access.
	if (length >= 2) {
		const unsigned char first = static_cast<unsigned char>(buffer[0]);
		const unsigned char second = static_cast<unsigned char>(buffer[1]);
		if (first == 0xFE && second == 0xFF) {
			ucs2 = "UTF-16BE";
		} else if (first == 0xFF && second == 0xFE) {
			ucs2 = "UTF-16";
		}
	}

	for (SBVector::const_iterator it = myMatchers.begin(); it != myMatchers.end(); ++it) {
		if (!ucs2.empty() && (*it)->info()->Encoding != ucs2) {
			continue;
		}

		const int criterion = (*it)->criterion(bufferStatistics);
		if (criterion > matchingCriterion) {
			info = (*it)->info();
			// Raise the bar so that later matchers must beat this one.
			matchingCriterion = criterion;
		}
	}
	return info;
}
Example #4
0
/**
 * Builds a language pattern file from the "txt" samples found in a directory.
 *
 * Usage: <program> <directory name with language examples in txt format>
 *
 * Statistics are generated for every file with the "txt" extension; the first
 * file's statistics seed the result and each further file is combined into it
 * with retain(). The top TOP_AMOUNT entries are then scaled with
 * scaleToShort() and written as XML to <argv[1]>pattern.stat.
 *
 * @return 0 on success and, as before, on the usage / unreadable-directory
 *         paths; 1 when the output file cannot be opened
 */
int main(int argc, char **argv) {
	if (argc != 2) {
		std::cerr << "usage:\n  " << argv[0] << " <directory name with language examples in txt format>\n";
		return 0;
	}
	ZLibrary::init(argc, argv);

	const std::string directoryName = argv[1];
	shared_ptr<ZLDir> directory = ZLFile(directoryName).directory(false);
	if (directory.isNull()) {
		std::cerr << "cannot open directory  " << directoryName << "\n";
		// The library has been initialised above, so release it before leaving.
		ZLibrary::shutdown();
		return 0;
	}

	std::vector<std::string> fileNames;
	directory->collectFiles(fileNames, false);

	ZLMapBasedStatistics tempStatistics, resultStatistics;
	bool isFirstSample = true;
	for (std::vector<std::string>::const_iterator it = fileNames.begin(); it != fileNames.end(); ++it) {
		ZLFile file(directory->itemPath(*it));
		if (file.extension() != "txt") {
			continue;
		}
		ZLStatisticsGenerator("\n\r ").generate(file.path(), CHAR_SEQUENCE_SIZE, tempStatistics);
		if (isFirstSample) {
			// The first sample seeds the result; later ones are merged via retain().
			resultStatistics = tempStatistics;
			isFirstSample = false;
			continue;
		}
		resultStatistics.retain(tempStatistics);
	}

	// NOTE(review): no path separator is inserted, so the directory argument
	// presumably has to end with one - confirm against how the tool is invoked.
	const std::string outName = directoryName + "pattern.stat";
	shared_ptr<ZLOutputStream> streamOut = ZLFile(outName).outputStream();
	// Do not dereference a missing stream or write to one that failed to open.
	if (streamOut.isNull() || !streamOut->open()) {
		std::cerr << "cannot open output file  " << outName << "\n";
		ZLibrary::shutdown();
		return 1;
	}

	ZLMapBasedStatistics restrictedResultStatistics = resultStatistics.top(TOP_AMOUNT);
	restrictedResultStatistics.scaleToShort();
	ZLStatisticsXMLWriter(*streamOut).writeStatistics(restrictedResultStatistics);
	streamOut->close();

	ZLibrary::shutdown();
	return 0;
}