void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
	std::string language = book.language();
	if (!language.empty()) {
		return;
	}

	PluginCollection &collection = PluginCollection::Instance();
	if (language.empty()) {
		language = collection.DefaultLanguageOption.value();
	}
	if (collection.LanguageAutoDetectOption.value() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *buffer = new char[BUFSIZE];
		const size_t size = stream.read(buffer, BUFSIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> info =
			ZLLanguageDetector().findInfo(buffer, size);
		delete[] buffer;
		if (!info.isNull()) {
			if (!info->Language.empty()) {
				language = info->Language;
			}
		}
	}
	book.setLanguage(language);
}
Esempio n. 2
0
ZLZipEntryCache::ZLZipEntryCache(const std::string &containerName, ZLInputStream &containerStream) : myContainerName(containerName) {
	//ZLLogger::Instance().println("ZipEntryCache", "creating cache for " + containerName);
	myLastModifiedTime = ZLFile(containerName).lastModified();
	if (!containerStream.open()) {
		return;
	}

	ZLZipHeader header;
	while (header.readFrom(containerStream)) {
		Info *infoPtr = 0;
		if (header.Signature == (unsigned long)ZLZipHeader::SignatureLocalFile) {
			std::string entryName(header.NameLength, '\0');
			if ((unsigned int)containerStream.read((char*)entryName.data(), header.NameLength) == header.NameLength) {
				entryName = AndroidUtil::convertNonUtfString(entryName);
				Info &info = myInfoMap[entryName];
				info.Offset = containerStream.offset() + header.ExtraLength;
				info.CompressionMethod = header.CompressionMethod;
				info.CompressedSize = header.CompressedSize;
				info.UncompressedSize = header.UncompressedSize;
				infoPtr = &info;
			}
		}
		ZLZipHeader::skipEntry(containerStream, header);
		if (infoPtr != 0) {
			infoPtr->UncompressedSize = header.UncompressedSize;
		}
	}
	containerStream.close();
}
Esempio n. 3
0
ZLZipEntryCache::ZLZipEntryCache(ZLInputStream &baseStream) {
	if (!baseStream.open()) {
		return;
	}

	ZLZipHeader header;
	while (header.readFrom(baseStream)) {
		Info *infoPtr = 0;
		if (header.Signature == ZLZipHeader::SignatureLocalFile) {
			std::string entryName(header.NameLength, '\0');
			if ((unsigned int)baseStream.read((char*)entryName.data(), header.NameLength) == header.NameLength) {
				Info &info = myInfoMap[entryName];
				info.Offset = baseStream.offset() + header.ExtraLength;
				info.CompressionMethod = header.CompressionMethod;
				info.CompressedSize = header.CompressedSize;
				info.UncompressedSize = header.UncompressedSize;
				infoPtr = &info;
			}
		}
		ZLZipHeader::skipEntry(baseStream, header);
		if (infoPtr != 0) {
			infoPtr->UncompressedSize = header.UncompressedSize;
		}
	}
	baseStream.close();
}
Esempio n. 4
0
bool FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force) {
	std::string language = book.language();
	if (!force && !language.empty()) {
		return true;
	}

	bool detected = false;

	PluginCollection &collection = PluginCollection::Instance();
	if (collection.isLanguageAutoDetectEnabled() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *buffer = new char[BUFSIZE];
		const std::size_t size = stream.read(buffer, BUFSIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> info =
			ZLLanguageDetector().findInfoForEncoding(encoding, buffer, size, -20000);
		delete[] buffer;
		if (!info.isNull()) {
			detected = true;
			if (!info->Language.empty()) {
				language = info->Language;
			}
		}
	}
	book.setLanguage(language);

	return detected;
}
Esempio n. 5
0
size_t HuffDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, size_t compressedSize, size_t maxUncompressedSize) {
	if (compressedSize == 0 || myErrorCode == ERROR_CORRUPTED_FILE) {
		return 0;
	}
	if (targetBuffer != 0) {
		unsigned char *sourceBuffer = new unsigned char[compressedSize];
		myTargetBuffer = targetBuffer;
		myTargetBufferEnd = targetBuffer + maxUncompressedSize;
		myTargetBufferPtr = targetBuffer;
		if (stream.read((char*)sourceBuffer, compressedSize) == compressedSize) {
			const size_t trailSize = sizeOfTrailingEntries(sourceBuffer, compressedSize);
			if (trailSize < compressedSize) {
				bitsDecompress(BitReader(sourceBuffer, compressedSize - trailSize));
			} else {
				myErrorCode = ERROR_CORRUPTED_FILE;
			}
		}
		delete[] sourceBuffer;
	} else {
		myTargetBuffer = 0;
		myTargetBufferEnd = 0;
		myTargetBufferPtr = 0;
	}

	return myTargetBufferPtr - myTargetBuffer;
}
void FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream) {
	std::string language = book.language();
	std::string encoding = book.encoding();
	if (!encoding.empty() && !language.empty()) {
		return;
	}

	PluginCollection &collection = PluginCollection::Instance();
	if (language.empty()) {
		language = collection.DefaultLanguageOption.value();
	}
	if (encoding.empty()) {
		encoding = collection.DefaultEncodingOption.value();
	}
	if (collection.LanguageAutoDetectOption.value() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *buffer = new char[BUFSIZE];
		const size_t size = stream.read(buffer, BUFSIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> info =
			ZLLanguageDetector().findInfo(buffer, size);
		delete[] buffer;
		if (!info.isNull()) {
			if (!info->Language.empty()) {
				language = info->Language;
			}
			encoding = info->Encoding;
			if ((encoding == "US-ASCII") || (encoding == "ISO-8859-1")) {
				encoding = "windows-1252";
			}
		}
	}
	book.setEncoding(encoding);
	book.setLanguage(language);
}
Esempio n. 7
0
size_t DocDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, size_t compressedSize, size_t maxUncompressedSize) {
	const unsigned char *sourceBuffer = new unsigned char[compressedSize];
	const unsigned char *sourceBufferEnd = sourceBuffer + compressedSize;
	const unsigned char *sourcePtr = sourceBuffer;

	unsigned char *targetBufferEnd = (unsigned char*)targetBuffer + maxUncompressedSize;
	unsigned char *targetPtr = (unsigned char*)targetBuffer;

	if (stream.read((char*)sourceBuffer, compressedSize) == compressedSize) {
		unsigned char token;
		unsigned short copyLength, N, shift;
		unsigned char *shifted;

		while ((sourcePtr < sourceBufferEnd) && (targetPtr < targetBufferEnd)) {
			token = *(sourcePtr++);
			switch (TOKEN_CODE[token]) {
				case 0:
					*(targetPtr++) = token;
					break;
				case 1:
					if ((sourcePtr + token > sourceBufferEnd) || (targetPtr + token > targetBufferEnd)) {
						goto endOfLoop;
					}
					memcpy(targetPtr, sourcePtr, token);
					sourcePtr += token;
					targetPtr += token;
					break;
				case 2:
					if (targetPtr + 2 > targetBufferEnd) {
						goto endOfLoop;
					}
					*(targetPtr++) = ' ';
					*(targetPtr++) = token ^ 0x80;
					break;
				case 3:
					if (sourcePtr + 1 > sourceBufferEnd) {
						goto endOfLoop;
					}
					N = 256 * token + *(sourcePtr++);
					copyLength = (N & 7) + 3;
					if (targetPtr + copyLength > targetBufferEnd) {
						goto endOfLoop;
					}
					shift = (N & 0x3fff) / 8;
					shifted = targetPtr - shift;
					if ((char*)shifted >= targetBuffer) {
						for (short i = 0; i < copyLength; i++) {
							*(targetPtr++) = *(shifted++);
						}
					}
					break;
			}
		}
	}
endOfLoop:

	delete[] sourceBuffer;
	return targetPtr - (unsigned char*)targetBuffer;
}
Esempio n. 8
0
bool ZLTarHeader::read(ZLInputStream &stream) {
	size_t startOffset = stream.offset();

	char fileName[101];
	stream.read(fileName, 100);
	if (fileName[0] == '\0') {
		return false;
	}
	fileName[100] = '\0';
	if (Name.empty()) {
		Name = fileName;
	}

	stream.seek(24, false);
	
	char fileSizeString[12];
	stream.read(fileSizeString, 12);
	Size = 0;
	for (int i = 0; i < 12; ++i) {
		if (!isdigit(fileSizeString[i])) {
			break;
		}
		Size *= 8;
		Size += fileSizeString[i] - '0';
	}
	
	stream.seek(20, false);
	char linkFlag;
	stream.read(&linkFlag, 1);
	
	IsRegularFile = (linkFlag == '\0') || (linkFlag == '0');

	stream.seek(355, false);
	
	if (((linkFlag == 'L') || (linkFlag == 'K')) && (Name == "././@LongLink") && (Size < 10240)) {
		Name.erase();
		Name.append(Size - 1, '\0');
		stream.read(const_cast<char*>(Name.data()), Size - 1);
		const int skip = 512 - (Size & 0x1ff);
		stream.seek(skip + 1, false);
		return (stream.offset() == startOffset + Size + skip + 512) && read(stream);
	} else {
		DataOffset = stream.offset();
		return DataOffset == startOffset + 512;
	}
}
Esempio n. 9
0
void PdbUtil::readUnsignedLongLE(ZLInputStream &stream, unsigned long &N) {
	unsigned char data[4];
	stream.read((char*)data, 4);
	N = (((unsigned long)data[3]) << 24) +
			(((unsigned long)data[2]) << 16) +
			(((unsigned long)data[1]) << 8) +
			(unsigned long)data[0];
}
Esempio n. 10
0
static unsigned short readUnsignedWord(ZLInputStream &stream) {
	unsigned char buffer[2];
	stream.read((char*)buffer, 2);
	unsigned short result = buffer[1];
	result = result << 8;
	result += buffer[0];
	return result;
}
Esempio n. 11
0
void PdbUtil::readUnsignedShort(ZLInputStream &stream, unsigned short &N) {
	unsigned char data[2];
	stream.read((char*)data, 2);
	N = (((unsigned short)data[0]) << 8) + data[1];
	/*
	stream.read((char*)&N + 1, 1);
	stream.read((char*)&N, 1);
	*/
}
Esempio n. 12
0
static unsigned long long readEncodedInteger(ZLInputStream &stream) {
	unsigned long long result = 0;
	char part;
	do {
		result = result << 7;
		stream.read(&part, 1);
		result += part & 0x7F;
	} while (part & -0x80);
	return result;
}
Esempio n. 13
0
unsigned long ZLZipHeader::readLong(ZLInputStream &stream) {
	char buffer[4];
	stream.read(buffer, 4);

	return
		((((unsigned long)buffer[3]) & 0xFF) << 24) +
		((((unsigned long)buffer[2]) & 0xFF) << 16) +
		((((unsigned long)buffer[1]) & 0xFF) << 8) +
		((unsigned long)buffer[0] & 0xFF);
}
Esempio n. 14
0
static std::string readNTString(ZLInputStream &stream) {
	std::string s;
	char c;
	while (stream.read(&c, 1) == 1) {
		if (c == '\0') {
			break;
		} else {
			s += c;
		}
	}
	return CHMReferenceCollection::fullReference("/", s);
}
Esempio n. 15
0
void TxtReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;
		for (char *ptr = start; ptr != end; ++ptr) {
			if (*ptr == '\n' || *ptr == '\r') {
				bool skipNewLine = false;
				if (*ptr == '\r' && (ptr + 1) != end && *(ptr + 1) == '\n') {
					skipNewLine = true;
					*ptr = '\n';
				}
				if (start != ptr) {
					str.erase();
					myConverter->convert(str, start, ptr + 1);
					characterDataHandler(str);
				}
				if (skipNewLine) {
					++ptr;
				}
				start = ptr + 1;
				newLineHandler();
			} else if (isspace((unsigned char)*ptr)) {
				if (*ptr != '\t') {
					*ptr = ' ';
				}
			} else {
			}
		}
		if (start != end) {
			str.erase();
			myConverter->convert(str, start, end);
			characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
Esempio n. 16
0
void PdbUtil::readUnsignedLong(ZLInputStream &stream, unsigned long &N) {
	unsigned char data[4];
	stream.read((char*)data, 4);
	N = (((unsigned long)data[0]) << 24) +
			(((unsigned long)data[1]) << 16) +
			(((unsigned long)data[2]) << 8) +
			(unsigned long)data[3];
	/*
	stream.read((char*)&N + 3, 1);
	stream.read((char*)&N + 2, 1);
	stream.read((char*)&N + 1, 1);
	stream.read((char*)&N, 1);
	*/
}
Esempio n. 17
0
void StyleSheetParser::parse(ZLInputStream &stream) {
	if (stream.open()) {
		char *buffer = new char[1024];
		while (true) {
			int len = stream.read(buffer, 1024);
			if (len == 0) {
				break;
			}
			parse(buffer, len);
		}
		delete[] buffer;
		stream.close();
	}
}
Esempio n. 18
0
HuffDecompressor::HuffDecompressor(ZLInputStream& stream, 
                        const std::vector<unsigned long>::const_iterator beginIt, 
                        const std::vector<unsigned long>::const_iterator endIt,
						const unsigned long endHuffDataOffset, const unsigned long extraFlags) : myExtraFlags(extraFlags), myErrorCode(ERROR_NONE) {
	

	const unsigned long huffHeaderOffset = *beginIt;
	const unsigned long huffRecordsNumber = endIt - beginIt;
	const unsigned long huffDataOffset = *(beginIt + 1);
	
	stream.seek(huffHeaderOffset, true);
	stream.seek(16, false);
	unsigned long cacheTableOffset, baseTableOffset;
	PdbUtil::readUnsignedLongBE(stream, cacheTableOffset);
	PdbUtil::readUnsignedLongBE(stream, baseTableOffset); 
	

	myCacheTable = new unsigned long[256];
	stream.seek(huffHeaderOffset + cacheTableOffset, true);
	for (size_t i = 0; i < 256; ++i) {
		PdbUtil::readUnsignedLongLE(stream, myCacheTable[i]); //LE
	}
	
	myBaseTable = new unsigned long[64]; 
	stream.seek(huffHeaderOffset + baseTableOffset, true);
	for (size_t i = 0; i < 64; ++i) {
		PdbUtil::readUnsignedLongLE(stream, myBaseTable[i]); //LE
	}
	
	stream.seek(huffDataOffset + 12, true);
	PdbUtil::readUnsignedLongBE(stream, myEntryBits);	
	
	size_t huffDataSize = endHuffDataOffset - huffDataOffset;
	myData = new unsigned char[huffDataSize];
	stream.seek(huffDataOffset, true);
	if (huffDataSize == stream.read((char*)myData, huffDataSize)) {	
		myDicts = new unsigned char* [huffRecordsNumber - 1];
		for(size_t i = 0; i < huffRecordsNumber - 1;  ++i) {	
			size_t shift = *(beginIt + i + 1) - huffDataOffset;
			myDicts[i] = myData + shift;
		}
	} else {
		myErrorCode = ERROR_CORRUPTED_FILE;
	}
	
	myTargetBuffer = 0;
	myTargetBufferEnd = 0;
	myTargetBufferPtr = 0;
}
static void readLine(ZLInputStream &stream, std::string &buffer) {
	buffer.clear();
	char ch;
	while (1) {
		if (stream.read(&ch, 1) != 1) {
			return;
		}
		if ((ch == 10) || (ch == 13)) {
			if (!buffer.empty()) {
				return;
			}
		} else {
			buffer += ch;
		}
	}
}
Esempio n. 20
0
bool PPLBookReader::readDocument(ZLInputStream &stream) {
	std::cout<<"PPLBookReader::readDocument\n";
	if (!stream.open()) {
		return false;
	}

	myModelReader.setMainTextModel();
	myModelReader.pushKind(REGULAR);
	myCurrentParagraph.erase();
	myEmptyLineCounter = 0;

	// "PPL\r\n"
	stream.seek(5);

	size_t size;
	do {
		size = stream.read(myBuffer, BUFFER_SIZE);
		myBuffer[size] = '\0';

		const char *start = myBuffer;
		const char *end = myBuffer + size;
		const char *eol;
		do {
			eol = strchr(start, '\n');
			if (eol != 0) {
				if (start < eol) {
					myConverter->convert(myCurrentParagraph, start, eol);
				}
				addParagraph();
				start = eol + 1;
			} else {
				if (start < end) {
					myConverter->convert(myCurrentParagraph, start, end);
				}
			}
		} while (eol != 0);
	} while (size == BUFFER_SIZE);

	addParagraph();

	stream.close();

	return true;
}
Esempio n. 21
0
void TxtReaderCoreUtf16::readDocument(ZLInputStream &stream) {
	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;
		for (char *ptr = start; ptr < end; ptr += 2) {
			const char chr = getAscii(ptr);
			if (chr == '\n' || chr == '\r') {
				bool skipNewLine = false;
				if (chr == '\r' && ptr + 2 != end && getAscii(ptr + 2) == '\n') {
					skipNewLine = true;
					setAscii(ptr, '\n');
				}
				if (start != ptr) {
					str.erase();
					myReader.myConverter->convert(str, start, ptr + 2);
					myReader.characterDataHandler(str);
				}
				if (skipNewLine) {
					ptr += 2;
				}
				start = ptr + 2;
				myReader.newLineHandler();
			} else if (chr != 0 && isspace(chr)) {
				if (chr != '\t') {
					setAscii(ptr, ' ');
				}
			}
		}
		if (start != end) {
			str.erase();
			myReader.myConverter->convert(str, start, end);
			myReader.characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;
}
Esempio n. 22
0
size_t ZLZDecompressor::decompress(ZLInputStream &stream, char *buffer, size_t maxSize) {
	while ((myBuffer.length() < maxSize) && (myAvailableSize > 0)) {
		size_t size = std::min(myAvailableSize, (size_t)IN_BUFFER_SIZE);

		myZStream->next_in = (Bytef*)myInBuffer;
		myZStream->avail_in = stream.read(myInBuffer, size);
		if (myZStream->avail_in == size) {
			myAvailableSize -= size;
		} else {
			myAvailableSize = 0;
		}
		while (myZStream->avail_in == 0) {
			break;
		}
		while (myZStream->avail_in > 0) {
			myZStream->avail_out = OUT_BUFFER_SIZE;
			myZStream->next_out = (Bytef*)myOutBuffer;
			int code = ::inflate(myZStream, Z_SYNC_FLUSH);
			if ((code != Z_OK) && (code != Z_STREAM_END)) {
				break;
			}
			if (OUT_BUFFER_SIZE == myZStream->avail_out) {
				break;
			}
			myBuffer.append(myOutBuffer, OUT_BUFFER_SIZE - myZStream->avail_out);
			if (code == Z_STREAM_END) {
				myAvailableSize = 0;
				stream.seek(0 - myZStream->avail_in, false);
				break;
			}
		}
	}

	size_t realSize = std::min(maxSize, myBuffer.length());
	if (buffer != 0) {
		memcpy(buffer, myBuffer.data(), realSize);
	}
	myBuffer.erase(0, realSize);
	return realSize;
}
Esempio n. 23
0
bool FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force) {
	std::string language = book.language();
	std::string encoding = book.encoding();

	if (!force && !encoding.empty()) {
		return true;
	}

	bool detected = false;
	PluginCollection &collection = PluginCollection::Instance();
	if (encoding.empty()) {
		encoding = ZLEncodingConverter::UTF8;
	}
	if (collection.isLanguageAutoDetectEnabled() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *buffer = new char[BUFSIZE];
		const std::size_t size = stream.read(buffer, BUFSIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> info = ZLLanguageDetector().findInfo(buffer, size);
		delete[] buffer;
		if (!info.isNull()) {
			detected = true;
			if (!info->Language.empty()) {
				language = info->Language;
			}
			encoding = info->Encoding;
			if (encoding == ZLEncodingConverter::ASCII || encoding == "iso-8859-1") {
				encoding = "windows-1252";
			}
		}
	}
	book.setEncoding(encoding);
	book.setLanguage(language);

	return detected;
}
Esempio n. 24
0
static std::string readString(ZLInputStream &stream, std::size_t length) {
	std::string string(length, ' ');
	stream.read(const_cast<char*>(string.data()), length);
	return string;
}
Esempio n. 25
0
void HtmlReader::readDocument(ZLInputStream &stream) {
    std::cout<<"HtmlReader\n";
    if (!stream.open()) {
        return;
    }

    startDocumentHandler();

    ParseState state = PS_TEXT;
    SpecialType state_special = ST_UNKNOWN;
    std::string currentString;
    int quotationCounter = 0;
    HtmlTag currentTag;
    char endOfComment[2] = "\0";

    const size_t BUFSIZE = 2048;
    char *buffer = new char[BUFSIZE];
    size_t length;
    do {
        length = stream.read(buffer, BUFSIZE);
        char *start = buffer;
        char *endOfBuffer = buffer + length;
        for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
            switch (state) {
            case PS_TEXT:
                if (*ptr == '<') {
                    if (!characterDataHandler(start, ptr - start, true)) {
                        goto endOfProcessing;
                    }
                    start = ptr + 1;
                    state = PS_TAGSTART;
                }
                if (*ptr == '&') {
                    if (!characterDataHandler(start, ptr - start, true)) {
                        goto endOfProcessing;
                    }
                    start = ptr + 1;
                    state = PS_SPECIAL;
                    state_special = ST_UNKNOWN;
                }
                break;
            case PS_SPECIAL:
                if (state_special == ST_UNKNOWN) {
                    if (*ptr == '#') {
                        state_special = ST_NUM;
                    } else if (isalpha(*ptr)) {
                        state_special = ST_NAME;
                    } else {
                        start = ptr;
                        state = PS_TEXT;
                    }
                } else if (state_special == ST_NUM) {
                    if (*ptr == 'x') {
                        state_special = ST_HEX;
                    } else if (isdigit(*ptr)) {
                        state_special = ST_DEC;
                    } else {
                        start = ptr;
                        state = PS_TEXT;
                    }
                } else {
                    if (*ptr == ';') {
                        currentString.append(start, ptr - start);
                        int number = specialSymbolNumber(state_special, currentString);
                        if (number != 0) {
                            char buffer[4];
                            int len = ZLUnicodeUtil::ucs2ToUtf8(buffer, number);
                            characterDataHandler(buffer, len, false);
                        } else {
                            currentString = "&" + currentString + ";";
                            characterDataHandler(currentString.c_str(), currentString.length(), false);
                        }
                        currentString.erase();
                        start = ptr + 1;
                        state = PS_TEXT;
                    } else if (!allowSymbol(state_special, *ptr)) {
                        start = ptr;
                        state = PS_TEXT;
                    }
                }
                break;
            case PS_TAGSTART:
                state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
                break;
            case PS_COMMENT:
                if ((endOfComment[0] == '\0') && (*ptr != '-')) {
                    state = PS_TAGNAME;
                } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
                    start = ptr + 1;
                    state = PS_TEXT;
                    endOfComment[0] = '\0';
                    endOfComment[1] = '\0';
                } else {
                    endOfComment[0] = endOfComment[1];
                    endOfComment[1] = *ptr;
                }
                break;
            case PS_TAGNAME:
                if ((*ptr == '>') || isspace(*ptr)) {
                    currentString.append(start, ptr - start);
                    start = ptr + 1;
                    setTag(currentTag, currentString);
                    currentString.erase();
                    if (currentTag.Name == "") {
                        state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
                    } else {
                        if (*ptr == '>') {
                            if (!tagHandler(currentTag)) {
                                goto endOfProcessing;
                            }
                            state = PS_TEXT;
                        } else {
                            state = PS_ATTRIBUTENAME;
                        }
                    }
                }
                break;
            case PS_ATTRIBUTENAME:
                if ((*ptr == '>') || (*ptr == '=') || isspace(*ptr)) {
                    if (ptr != start) {
                        currentString.append(start, ptr - start);
                        for (unsigned int i = 0; i < currentString.length(); ++i) {
                            currentString[i] = toupper(currentString[i]);
                        }
                        currentTag.addAttribute(currentString);
                        currentString.erase();
                    }
                    start = ptr + 1;
                    if (*ptr == '>') {
                        if (!tagHandler(currentTag)) {
                            goto endOfProcessing;
                        }
                        state = PS_TEXT;
                    } else {
                        state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
                    }
                }
                break;
            case PS_ATTRIBUTEVALUE:
                if (*ptr == '"') {
                    if ((ptr == start) || (quotationCounter > 0)) {
                        ++quotationCounter;
                    }
                } else if ((quotationCounter != 1) && ((*ptr == '>') || isspace(*ptr))) {
                    if (ptr != start) {
                        currentString.append(start, ptr - start);
                        if (currentString[0] == '"') {
                            currentString = currentString.substr(1, currentString.length() - 2);
                        }
                        currentTag.setLastAttributeValue(currentString);
                        currentString.erase();
                        quotationCounter = 0;
                    }
                    start = ptr + 1;
                    if (*ptr == '>') {
                        if (!tagHandler(currentTag)) {
                            goto endOfProcessing;
                        }
                        state = PS_TEXT;
                    } else {
                        state = PS_ATTRIBUTENAME;
                    }
                }
                break;
            case PS_SKIPTAG:
                if (*ptr == '>') {
                    start = ptr + 1;
                    state = PS_TEXT;
                }
                break;
            }
        }
        if (start != endOfBuffer) {
            switch (state) {
            case PS_TEXT:
                if (!characterDataHandler(start, endOfBuffer - start, true)) {
                    goto endOfProcessing;
                }
                break;
            case PS_TAGNAME:
            case PS_ATTRIBUTENAME:
            case PS_ATTRIBUTEVALUE:
            case PS_SPECIAL:
                currentString.append(start, endOfBuffer - start);
                break;
            case PS_TAGSTART:
            case PS_SKIPTAG:
            case PS_COMMENT:
                break;
            }
        }
    } while (length == BUFSIZE);
endOfProcessing:
    delete[] buffer;

    endDocumentHandler();

    stream.close();
}
Esempio n. 26
0
unsigned short ZLZipHeader::readShort(ZLInputStream &stream) {
	char buffer[2];
	stream.read(buffer, 2);
	return ((((unsigned short)buffer[1]) & 0xFF) << 8) + ((unsigned short)buffer[0] & 0xFF);
}
Esempio n. 27
0
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	std::string attributeValueString;
	std::string specialString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	char endOfComment[2] = "\0";

	const std::size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::size_t length;
	std::size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
						currentTag.Offset = offset + (ptr - buffer);
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (std::isalpha(*ptr)) {
							state_special = ST_NAME;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (std::isdigit(*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else {
						if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if (128 <= number && number <= 159) {
								char ch = number;
								if (state == PS_SPECIAL) {
									characterDataHandler(&ch, 1, true);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								char buffer[4];
								int len = ZLUnicodeUtil::ucs4ToUtf8(buffer, number);
								if (state == PS_SPECIAL) {
									characterDataHandler(buffer, len, false);
								} else {
									attributeValueString.append(buffer, len);
								}
							} else {
								specialString = "&" + specialString + ";";
								if (state == PS_SPECIAL) {
									characterDataHandler(specialString.c_str(), specialString.length(), false);
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					}
					break;
				case PS_TAGSTART:
					state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if (endOfComment[0] == '\0' && *ptr != '-') {
						state = PS_TAGNAME;
					} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_WAIT_END_OF_TAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
				case PS_TAGNAME:
					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							ZLStringUtil::asciiToLowerInline(currentString);
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if (*ptr == '&') {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						appendString(attributeValueString, currentString);
						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
						state_special = ST_UNKNOWN;
					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							appendString(attributeValueString, currentString);
							if (attributeValueString[0] == '"') {
								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
							}
							currentTag.setLastAttributeValue(attributeValueString);
							attributeValueString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);
endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
void PlainTextFormatDetector::detect(ZLInputStream &stream, PlainTextFormat &format) {
    if (!stream.open()) {
        return;
    }

    const unsigned int tableSize = 10;

    unsigned int lineCounter = 0;
    int emptyLineCounter = -1;
    unsigned int stringsWithLengthLessThan81Counter = 0;
    unsigned int stringIndentTable[tableSize] = { 0 };
    unsigned int emptyLinesTable[tableSize] = { 0 };
    unsigned int emptyLinesBeforeShortStringTable[tableSize] = { 0 };

    bool currentLineIsEmpty = true;
    unsigned int currentLineLength = 0;
    unsigned int currentLineIndent = 0;
    int currentNumberOfEmptyLines = -1;

    char *buffer = new char[BUFFER_SIZE];
    int length;
    char previous = 0;
    do {
        length = stream.read(buffer, BUFFER_SIZE);
        const char *end = buffer + length;
        for (const char *ptr = buffer; ptr != end; ++ptr) {
            ++currentLineLength;
            if (*ptr == '\n') {
                ++lineCounter;
                if (currentLineIsEmpty) {
                    ++emptyLineCounter;
                    ++currentNumberOfEmptyLines;
                } else {
                    if (currentNumberOfEmptyLines >= 0) {
                        int index = std::min(currentNumberOfEmptyLines, (int)tableSize - 1);
                        emptyLinesTable[index]++;
                        if (currentLineLength < 51) {
                            emptyLinesBeforeShortStringTable[index]++;
                        }
                    }
                    currentNumberOfEmptyLines = -1;
                }
                if (currentLineLength < 81) {
                    ++stringsWithLengthLessThan81Counter;
                }
                if (!currentLineIsEmpty) {
                    stringIndentTable[std::min(currentLineIndent, tableSize - 1)]++;
                }

                currentLineIsEmpty = true;
                currentLineLength = 0;
                currentLineIndent = 0;
            } else if (*ptr == '\r') {
                continue;
            } else if (isspace((unsigned char)*ptr)) {
                if (currentLineIsEmpty) {
                    ++currentLineIndent;
                }
            } else {
                currentLineIsEmpty = false;
            }
            previous = *ptr;
        }
    } while (length == BUFFER_SIZE);
    delete[] buffer;

    unsigned int nonEmptyLineCounter = lineCounter - emptyLineCounter;

    {
        unsigned int indent = 0;
        unsigned int lineWithIndent = 0;
        for (; indent < tableSize; ++indent) {
            lineWithIndent += stringIndentTable[indent];
            if (lineWithIndent > 0.1 * nonEmptyLineCounter) {
                break;
            }
        }
        format.IgnoredIndentOption.setValue(indent + 1);
    }

    {
        int breakType = 0;
        breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE;
        breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE;
        if (stringsWithLengthLessThan81Counter >= 0.5 * nonEmptyLineCounter) {
            breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT;
        }
        format.BreakTypeOption.setValue(breakType);
    }

    {
        unsigned int max = 0;
        unsigned index;
        int emptyLinesBeforeNewSection = -1;
        for (index = 2; index < tableSize; ++index) {
            if (max < emptyLinesBeforeShortStringTable[index]) {
                max = emptyLinesBeforeShortStringTable[index];
                emptyLinesBeforeNewSection = index;
            }
        }
        if (emptyLinesBeforeNewSection > 0) {
            for (index = tableSize - 1; index > 0; --index) {
                emptyLinesTable[index - 1] += emptyLinesTable[index];
                emptyLinesBeforeShortStringTable[index - 1] += emptyLinesBeforeShortStringTable[index];
            }
            for (index = emptyLinesBeforeNewSection; index < tableSize; ++index) {
                if ((emptyLinesBeforeShortStringTable[index] > 2) &&
                    (emptyLinesBeforeShortStringTable[index] > 0.7 * emptyLinesTable[index])) {
                        break;
                }
            }
            emptyLinesBeforeNewSection = (index == tableSize) ? -1 : (int)index;
        }
        format.EmptyLinesBeforeNewSectionOption.setValue(emptyLinesBeforeNewSection);
        format.CreateContentsTableOption.setValue(emptyLinesBeforeNewSection > 0);
    }

    format.InitializedOption.setValue(true);
}
Esempio n. 29
0
bool PmlReader::parseDocument(ZLInputStream &stream) {
	enum {
		READ_NORMAL_DATA,
		READ_TAG,
		READ_TAG_PARAMETER,
	} parserState = READ_NORMAL_DATA;

	size_t tagNameLength = 0;
	std::string tagName;
	std::string parameterString;
	
	bool startParameterReading = false;
	size_t tagCounter = 0;
	static bool FLAG = true;

	while (!myIsInterrupted) {
		const char *ptr = myStreamBuffer;
		const char *end = myStreamBuffer + stream.read(myStreamBuffer, pmlStreamBufferSize);
		if (ptr == end) {
			break;
		}
		const char *dataStart = ptr;
		bool readNextChar = true;
		while (ptr != end) {
			switch (parserState) {
				case READ_NORMAL_DATA:
					if (*ptr == '\n') {
						if (ptr > dataStart) {
							processCharData(dataStart, ptr - dataStart);
						}
						newLine();
						FLAG = true;
						dataStart = ptr + 1;
					} else if (FLAG && isspace(*ptr)) {
					} else {
						FLAG = false;
						if (*ptr == '\\') {
							if (ptr > dataStart) {
								processCharData(dataStart, ptr - dataStart);
							}
							dataStart = ptr + 1;
							tagName.erase();
							parserState = READ_TAG;
						}
					}
					break;
				case READ_TAG:
					if ((ptr == dataStart) && (tagName.empty())) {
						if (*ptr == '\\') {
							processCharData(ptr, 1);
							dataStart = ptr + 1;
							parserState = READ_NORMAL_DATA;
						} else {
							tagNameLength = findTagLength(ptr);
							if (tagNameLength == 0) {
								dataStart = ptr + 1;
								parserState = READ_NORMAL_DATA;
								++tagCounter;
							} else {
								--tagNameLength;
							}
						}
					} else {
						if (tagNameLength == 0) {
							tagName.append(dataStart, ptr - dataStart);
							if (*ptr == '=') {
								dataStart = ptr + 1; 
								parameterString.erase();
								parserState = READ_TAG_PARAMETER;
								++tagCounter;
							} else {
								readNextChar = false;
								processTag(tagName);
								dataStart = ptr;
								parserState = READ_NORMAL_DATA;
								++tagCounter;
							}
						} else {
							--tagNameLength;
						}
					}
					break;
				case READ_TAG_PARAMETER:
					if (*ptr == '"') { 
						if (!startParameterReading) {
							startParameterReading = true;
							dataStart = ptr + 1;  
						} else {
							parameterString.append(dataStart, ptr - dataStart);
							processTag(tagName, parameterString);
							parserState = READ_NORMAL_DATA;
							dataStart =  ptr + 1;
							startParameterReading = false;
						}
					}
					break;
			}
			if (readNextChar) {
				++ptr;
			} else {
				readNextChar = true;
			}
		}
		if (dataStart < end) {
			switch (parserState) {
				case READ_NORMAL_DATA:
					processCharData(dataStart, end - dataStart);
				case READ_TAG:
					tagName.append(dataStart, end - dataStart);
					break;
				case READ_TAG_PARAMETER:
					parameterString.append(dataStart, end - dataStart);
					break;
				default:
					break;
			}
		}
	}
	return myIsInterrupted;
}