// Fills in the book's language when none has been set yet.
//
// The collection-wide default language is used as the starting value. If
// language auto-detection is enabled and the stream can be opened, up to
// 64 KiB are read from it and handed to ZLLanguageDetector; a non-empty
// detected language replaces the default. A book that already has a
// language is left untouched.
void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
	if (!book.language().empty()) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	std::string language = plugins.DefaultLanguageOption.value();

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *sample = new char[BUFSIZE];
		const size_t sampleSize = stream.read(sample, BUFSIZE);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> detected =
			ZLLanguageDetector().findInfo(sample, sampleSize);
		delete[] sample;

		if (!detected.isNull() && !detected->Language.empty()) {
			language = detected->Language;
		}
	}

	book.setLanguage(language);
}
// Builds the entry table for the zip container `containerName` by walking
// the headers found in `containerStream`.
//
// The container's last-modified time is recorded first. If the stream cannot
// be opened the cache stays empty. For every local-file header whose name
// could be read completely, the (converted) entry name is mapped to the data
// offset, compression method and sizes taken from the header.
ZLZipEntryCache::ZLZipEntryCache(const std::string &containerName, ZLInputStream &containerStream) : myContainerName(containerName) {
	myLastModifiedTime = ZLFile(containerName).lastModified();

	if (!containerStream.open()) {
		return;
	}

	ZLZipHeader header;
	while (header.readFrom(containerStream)) {
		Info *entry = 0;

		if (header.Signature == (unsigned long)ZLZipHeader::SignatureLocalFile) {
			std::string rawName(header.NameLength, '\0');
			const unsigned int bytesRead =
				(unsigned int)containerStream.read((char*)rawName.data(), header.NameLength);
			if (bytesRead == header.NameLength) {
				entry = &myInfoMap[AndroidUtil::convertNonUtfString(rawName)];
				entry->Offset = containerStream.offset() + header.ExtraLength;
				entry->CompressionMethod = header.CompressionMethod;
				entry->CompressedSize = header.CompressedSize;
				entry->UncompressedSize = header.UncompressedSize;
			}
		}

		ZLZipHeader::skipEntry(containerStream, header);

		// The size is copied again after skipEntry(); presumably skipEntry()
		// may update the header (e.g. from a trailing data descriptor) --
		// TODO confirm against ZLZipHeader.
		if (entry != 0) {
			entry->UncompressedSize = header.UncompressedSize;
		}
	}

	containerStream.close();
}
// Builds the entry table by walking the zip headers found in `baseStream`.
//
// Nothing is recorded if the stream cannot be opened. Each local-file header
// whose name could be read in full yields one map entry (keyed by the raw
// entry name) holding the data offset, compression method and sizes.
ZLZipEntryCache::ZLZipEntryCache(ZLInputStream &baseStream) {
	if (!baseStream.open()) {
		return;
	}

	for (ZLZipHeader header; header.readFrom(baseStream); ) {
		Info *recorded = 0;

		if (header.Signature == ZLZipHeader::SignatureLocalFile) {
			std::string name(header.NameLength, '\0');
			const bool nameComplete =
				(unsigned int)baseStream.read((char*)name.data(), header.NameLength) == header.NameLength;
			if (nameComplete) {
				recorded = &myInfoMap[name];
				recorded->Offset = baseStream.offset() + header.ExtraLength;
				recorded->CompressionMethod = header.CompressionMethod;
				recorded->CompressedSize = header.CompressedSize;
				recorded->UncompressedSize = header.UncompressedSize;
			}
		}

		ZLZipHeader::skipEntry(baseStream, header);

		// Re-read the size after skipEntry(); presumably skipEntry() can
		// refresh it in the header -- TODO confirm against ZLZipHeader.
		if (recorded != 0) {
			recorded->UncompressedSize = header.UncompressedSize;
		}
	}

	baseStream.close();
}
// Detects the book's language from the stream content, assuming the given
// text encoding.
//
// Returns true immediately when the book already has a language and `force`
// is false. Otherwise, if auto-detection is enabled and the stream opens, up
// to 64 KiB are sampled and passed to ZLLanguageDetector. The book's language
// is always (re)written at the end -- with the detected value if one was
// found, otherwise with the value it already had.
//
// Returns true when the detector produced a result (even one with an empty
// language), false otherwise.
bool FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force) {
	std::string language = book.language();
	if (!force && !language.empty()) {
		return true;
	}

	bool detected = false;

	if (PluginCollection::Instance().isLanguageAutoDetectEnabled() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *sample = new char[BUFSIZE];
		const std::size_t sampleSize = stream.read(sample, BUFSIZE);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> info =
			ZLLanguageDetector().findInfoForEncoding(encoding, sample, sampleSize, -20000);
		delete[] sample;

		detected = !info.isNull();
		if (detected && !info->Language.empty()) {
			language = info->Language;
		}
	}

	book.setLanguage(language);
	return detected;
}
// Reads `compressedSize` bytes from `stream` and decompresses them into
// `targetBuffer`, which can hold at most `maxUncompressedSize` bytes.
//
// Returns the number of bytes written to the target buffer. Returns 0 when
// there is nothing to read, when the decompressor is already in the
// ERROR_CORRUPTED_FILE state, or when `targetBuffer` is null. If the record
// consists only of trailing entries the decompressor is marked as corrupted.
size_t HuffDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, size_t compressedSize, size_t maxUncompressedSize) {
	if (myErrorCode == ERROR_CORRUPTED_FILE || compressedSize == 0) {
		return 0;
	}

	if (targetBuffer == 0) {
		myTargetBuffer = 0;
		myTargetBufferEnd = 0;
		myTargetBufferPtr = 0;
	} else {
		myTargetBuffer = targetBuffer;
		myTargetBufferEnd = targetBuffer + maxUncompressedSize;
		myTargetBufferPtr = targetBuffer;

		unsigned char *packed = new unsigned char[compressedSize];
		const bool readComplete = stream.read((char*)packed, compressedSize) == compressedSize;
		if (readComplete) {
			const size_t trailSize = sizeOfTrailingEntries(packed, compressedSize);
			if (trailSize >= compressedSize) {
				myErrorCode = ERROR_CORRUPTED_FILE;
			} else {
				bitsDecompress(BitReader(packed, compressedSize - trailSize));
			}
		}
		delete[] packed;
	}

	// The write pointer is advanced elsewhere (not visible here), so the
	// distance from the buffer start is the amount of output produced.
	return myTargetBufferPtr - myTargetBuffer;
}
// Fills in whichever of the book's encoding and language are still missing.
//
// Does nothing when both are already set. Otherwise the missing values start
// from the collection-wide defaults; if auto-detection is enabled and the
// stream opens, up to 64 KiB are sampled and the detector's result overrides
// the encoding (and the language, when the detected one is non-empty).
// "US-ASCII" and "ISO-8859-1" are mapped to "windows-1252".
void FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream) {
	std::string language = book.language();
	std::string encoding = book.encoding();
	if (!language.empty() && !encoding.empty()) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	if (language.empty()) {
		language = plugins.DefaultLanguageOption.value();
	}
	if (encoding.empty()) {
		encoding = plugins.DefaultEncodingOption.value();
	}

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *sample = new char[BUFSIZE];
		const size_t sampleSize = stream.read(sample, BUFSIZE);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> detected =
			ZLLanguageDetector().findInfo(sample, sampleSize);
		delete[] sample;

		if (!detected.isNull()) {
			if (!detected->Language.empty()) {
				language = detected->Language;
			}
			encoding = detected->Encoding;
			const bool isLatinSubset = (encoding == "US-ASCII") || (encoding == "ISO-8859-1");
			if (isLatinSubset) {
				encoding = "windows-1252";
			}
		}
	}

	book.setEncoding(encoding);
	book.setLanguage(language);
}
size_t DocDecompressor::decompress(ZLInputStream &stream, char *targetBuffer, size_t compressedSize, size_t maxUncompressedSize) { const unsigned char *sourceBuffer = new unsigned char[compressedSize]; const unsigned char *sourceBufferEnd = sourceBuffer + compressedSize; const unsigned char *sourcePtr = sourceBuffer; unsigned char *targetBufferEnd = (unsigned char*)targetBuffer + maxUncompressedSize; unsigned char *targetPtr = (unsigned char*)targetBuffer; if (stream.read((char*)sourceBuffer, compressedSize) == compressedSize) { unsigned char token; unsigned short copyLength, N, shift; unsigned char *shifted; while ((sourcePtr < sourceBufferEnd) && (targetPtr < targetBufferEnd)) { token = *(sourcePtr++); switch (TOKEN_CODE[token]) { case 0: *(targetPtr++) = token; break; case 1: if ((sourcePtr + token > sourceBufferEnd) || (targetPtr + token > targetBufferEnd)) { goto endOfLoop; } memcpy(targetPtr, sourcePtr, token); sourcePtr += token; targetPtr += token; break; case 2: if (targetPtr + 2 > targetBufferEnd) { goto endOfLoop; } *(targetPtr++) = ' '; *(targetPtr++) = token ^ 0x80; break; case 3: if (sourcePtr + 1 > sourceBufferEnd) { goto endOfLoop; } N = 256 * token + *(sourcePtr++); copyLength = (N & 7) + 3; if (targetPtr + copyLength > targetBufferEnd) { goto endOfLoop; } shift = (N & 0x3fff) / 8; shifted = targetPtr - shift; if ((char*)shifted >= targetBuffer) { for (short i = 0; i < copyLength; i++) { *(targetPtr++) = *(shifted++); } } break; } } } endOfLoop: delete[] sourceBuffer; return targetPtr - (unsigned char*)targetBuffer; }
// Reads one 512-byte tar header from the current stream position.
//
// Fields consumed, in order: 100-byte name, 24 skipped bytes, 12-byte octal
// size, 20 skipped bytes, 1-byte link flag, 355 skipped bytes (512 total).
// Name is only taken from the header if it is currently empty, so that a
// long name read from a preceding "././@LongLink" entry is preserved.
//
// For an 'L'/'K' entry named "././@LongLink" the entry data (the real name)
// is read into Name and the following header is parsed recursively.
//
// Returns false at the end of the archive (empty or unreadable name) or when
// the stream position after parsing is not where a well-formed header would
// leave it; otherwise true, with Name, Size, IsRegularFile and DataOffset set.
bool ZLTarHeader::read(ZLInputStream &stream) {
	const size_t startOffset = stream.offset();

	char fileName[101];
	// A short read would leave the name indeterminate; treat it like the
	// end-of-archive marker.
	if ((stream.read(fileName, 100) != 100) || (fileName[0] == '\0')) {
		return false;
	}
	fileName[100] = '\0';
	if (Name.empty()) {
		Name = fileName;
	}

	stream.seek(24, false);

	// Zero-filled so that a short read parses as "no digits" instead of
	// whatever happened to be on the stack.
	char fileSizeString[12] = { 0 };
	stream.read(fileSizeString, 12);
	Size = 0;
	for (int i = 0; i < 12; ++i) {
		// isdigit() requires a value representable as unsigned char.
		if (!isdigit((unsigned char)fileSizeString[i])) {
			break;
		}
		Size *= 8;
		Size += fileSizeString[i] - '0';
	}

	stream.seek(20, false);

	char linkFlag = '\0';
	stream.read(&linkFlag, 1);
	IsRegularFile = (linkFlag == '\0') || (linkFlag == '0');

	stream.seek(355, false);

	// Size > 0 is required: the name is stored with a trailing NUL, so
	// Size - 1 bytes are read, and a zero size would wrap around.
	const bool isLongName =
		((linkFlag == 'L') || (linkFlag == 'K')) &&
		(Name == "././@LongLink") &&
		(Size > 0) && (Size < 10240);

	if (isLongName) {
		Name.erase();
		Name.append(Size - 1, '\0');
		stream.read(const_cast<char*>(Name.data()), Size - 1);
		// Entry data is padded up to a 512-byte boundary; a size that is
		// already a multiple of 512 has no padding at all.
		const int skip = (512 - (Size & 0x1ff)) & 0x1ff;
		// +1 for the trailing NUL that was not read into Name.
		stream.seek(skip + 1, false);
		return (stream.offset() == startOffset + Size + skip + 512) && read(stream);
	} else {
		DataOffset = stream.offset();
		return DataOffset == startOffset + 512;
	}
}
// Reads 4 bytes from `stream` and stores them in `N` as a little-endian
// 32-bit value (first byte is the least significant).
//
// The byte buffer is zero-initialised, so bytes missing because of a short
// read contribute 0 instead of indeterminate stack contents.
void PdbUtil::readUnsignedLongLE(ZLInputStream &stream, unsigned long &N) {
	unsigned char data[4] = { 0, 0, 0, 0 };
	stream.read((char*)data, 4);
	N = ((unsigned long)data[0]) |
		(((unsigned long)data[1]) << 8) |
		(((unsigned long)data[2]) << 16) |
		(((unsigned long)data[3]) << 24);
}
// Reads 2 bytes from `stream` and returns them as a little-endian 16-bit
// value (first byte is the least significant).
//
// The byte buffer is zero-initialised, so bytes missing because of a short
// read contribute 0 instead of indeterminate stack contents.
static unsigned short readUnsignedWord(ZLInputStream &stream) {
	unsigned char bytes[2] = { 0, 0 };
	stream.read((char*)bytes, 2);
	return (unsigned short)((bytes[1] << 8) | bytes[0]);
}
// Reads 2 bytes from `stream` and stores them in `N` as a big-endian 16-bit
// value (first byte is the most significant).
//
// The byte buffer is zero-initialised, so bytes missing because of a short
// read contribute 0 instead of indeterminate stack contents.
void PdbUtil::readUnsignedShort(ZLInputStream &stream, unsigned short &N) {
	unsigned char data[2] = { 0, 0 };
	stream.read((char*)data, 2);
	N = (unsigned short)((data[0] << 8) | data[1]);
}
// Reads a variable-length integer from `stream`: each byte carries 7 payload
// bits (most significant group first) and the high bit marks that another
// byte follows.
//
// Decoding stops at the first byte without the continuation bit, or as soon
// as the stream runs out of data. Without the end-of-stream check a final
// byte with the continuation bit set would be re-used forever, because a
// failed read leaves `part` unchanged.
static unsigned long long readEncodedInteger(ZLInputStream &stream) {
	unsigned long long result = 0;
	char part = 0;
	do {
		if (stream.read(&part, 1) != 1) {
			break;
		}
		result = (result << 7) + (part & 0x7F);
	} while (part & 0x80);
	return result;
}
// Reads 4 bytes from `stream` and returns them as a little-endian 32-bit
// value (first byte is the least significant).
//
// The byte buffer is unsigned and zero-initialised: no per-byte masking is
// needed, and bytes missing because of a short read contribute 0 instead of
// indeterminate stack contents.
unsigned long ZLZipHeader::readLong(ZLInputStream &stream) {
	unsigned char bytes[4] = { 0, 0, 0, 0 };
	stream.read((char*)bytes, 4);
	return ((unsigned long)bytes[0]) |
		(((unsigned long)bytes[1]) << 8) |
		(((unsigned long)bytes[2]) << 16) |
		(((unsigned long)bytes[3]) << 24);
}
// Reads bytes from `stream` up to (and consuming) the first NUL byte, or
// until the stream ends, and returns the collected text resolved against
// "/" via CHMReferenceCollection::fullReference().
static std::string readNTString(ZLInputStream &stream) {
	std::string name;
	for (char ch; (stream.read(&ch, 1) == 1) && (ch != '\0'); ) {
		name += ch;
	}
	return CHMReferenceCollection::fullReference("/", name);
}
// Streams a plain-text document through the reader's handlers.
//
// The stream is read in 2048-byte chunks. Within a chunk the text is split at
// line breaks ("\n", "\r" or "\r\n"); every piece is run through myConverter
// and passed to characterDataHandler(), and newLineHandler() is called once
// per line break. Whitespace other than tab is replaced by a plain space.
// startDocumentHandler()/endDocumentHandler() bracket the whole run.
//
// A "\r\n" pair split across two chunks is reported as a single line break:
// the '\r' at the end of one chunk is handled immediately, and a '\n' that
// opens the next chunk is then skipped.
void TxtReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}
	startDocumentHandler();

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	bool previousChunkEndedWithCR = false;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;
		char *ptr = buffer;

		if (previousChunkEndedWithCR && (ptr != end) && (*ptr == '\n')) {
			// Second half of a split "\r\n"; the break was already reported.
			++ptr;
			start = ptr;
		}
		previousChunkEndedWithCR = false;

		for (; ptr != end; ++ptr) {
			if (*ptr == '\n' || *ptr == '\r') {
				bool skipNewLine = false;
				if (*ptr == '\r') {
					if ((ptr + 1) == end) {
						previousChunkEndedWithCR = true;
					} else if (*(ptr + 1) == '\n') {
						skipNewLine = true;
						*ptr = '\n';
					}
				}
				if (start != ptr) {
					// The line-break character itself is part of the data
					// handed to the converter.
					str.erase();
					myConverter->convert(str, start, ptr + 1);
					characterDataHandler(str);
				}
				if (skipNewLine) {
					++ptr;
				}
				start = ptr + 1;
				newLineHandler();
			} else if (isspace((unsigned char)*ptr)) {
				if (*ptr != '\t') {
					*ptr = ' ';
				}
			}
		}

		// Flush the unterminated tail of the chunk.
		if (start != end) {
			str.erase();
			myConverter->convert(str, start, end);
			characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;

	endDocumentHandler();
	stream.close();
}
// Reads 4 bytes from `stream` and stores them in `N` as a big-endian 32-bit
// value (first byte is the most significant).
//
// The byte buffer is zero-initialised, so bytes missing because of a short
// read contribute 0 instead of indeterminate stack contents.
void PdbUtil::readUnsignedLong(ZLInputStream &stream, unsigned long &N) {
	unsigned char data[4] = { 0, 0, 0, 0 };
	stream.read((char*)data, 4);
	N = (((unsigned long)data[0]) << 24) |
		(((unsigned long)data[1]) << 16) |
		(((unsigned long)data[2]) << 8) |
		((unsigned long)data[3]);
}
// Feeds the whole content of `stream` to parse(const char*, int) in chunks of
// up to 1024 bytes. Does nothing if the stream cannot be opened; the stream
// is closed again once a read returns no data.
void StyleSheetParser::parse(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	char *chunk = new char[1024];
	for (int len = stream.read(chunk, 1024); len != 0; len = stream.read(chunk, 1024)) {
		parse(chunk, len);
	}
	delete[] chunk;

	stream.close();
}
// Loads the Huffman tables and dictionary records needed for decompression.
//
// [beginIt, endIt) holds record offsets: *beginIt is the header record,
// *(beginIt + 1) is the first dictionary record, and `endHuffDataOffset`
// marks the end of the dictionary data. From the header record the offsets
// of the cache table (256 little-endian entries) and the base table (64
// little-endian entries) are read; the dictionary data is then loaded in one
// piece and myDicts[i] is pointed at the start of the i-th dictionary record
// inside it.
//
// On malformed input (fewer than two offsets, an end offset that precedes
// the data, or a short read of the dictionary data) the error code is set to
// ERROR_CORRUPTED_FILE. Every pointer member is given a defined value (null
// or a valid allocation) on all paths -- previously myDicts was left
// uninitialised when the dictionary read failed.
// NOTE(review): assumes the destructor releases these members with delete[],
// which accepts null -- confirm.
HuffDecompressor::HuffDecompressor(ZLInputStream& stream, const std::vector<unsigned long>::const_iterator beginIt, const std::vector<unsigned long>::const_iterator endIt, const unsigned long endHuffDataOffset, const unsigned long extraFlags) : myExtraFlags(extraFlags), myErrorCode(ERROR_NONE) {
	myCacheTable = 0;
	myBaseTable = 0;
	myData = 0;
	myDicts = 0;
	myTargetBuffer = 0;
	myTargetBufferEnd = 0;
	myTargetBufferPtr = 0;

	// Both *beginIt and *(beginIt + 1) are dereferenced below.
	if (endIt - beginIt < 2) {
		myErrorCode = ERROR_CORRUPTED_FILE;
		return;
	}

	const unsigned long huffHeaderOffset = *beginIt;
	const unsigned long huffRecordsNumber = endIt - beginIt;
	const unsigned long huffDataOffset = *(beginIt + 1);

	// Guards the unsigned subtraction that yields the data size.
	if (endHuffDataOffset < huffDataOffset) {
		myErrorCode = ERROR_CORRUPTED_FILE;
		return;
	}

	stream.seek(huffHeaderOffset, true);
	stream.seek(16, false);
	unsigned long cacheTableOffset = 0;
	unsigned long baseTableOffset = 0;
	PdbUtil::readUnsignedLongBE(stream, cacheTableOffset);
	PdbUtil::readUnsignedLongBE(stream, baseTableOffset);

	myCacheTable = new unsigned long[256];
	stream.seek(huffHeaderOffset + cacheTableOffset, true);
	for (size_t i = 0; i < 256; ++i) {
		PdbUtil::readUnsignedLongLE(stream, myCacheTable[i]); //LE
	}

	myBaseTable = new unsigned long[64];
	stream.seek(huffHeaderOffset + baseTableOffset, true);
	for (size_t i = 0; i < 64; ++i) {
		PdbUtil::readUnsignedLongLE(stream, myBaseTable[i]); //LE
	}

	stream.seek(huffDataOffset + 12, true);
	PdbUtil::readUnsignedLongBE(stream, myEntryBits);

	const size_t huffDataSize = endHuffDataOffset - huffDataOffset;
	myData = new unsigned char[huffDataSize];
	stream.seek(huffDataOffset, true);
	if (huffDataSize == stream.read((char*)myData, huffDataSize)) {
		// One dictionary per record after the header record.
		myDicts = new unsigned char* [huffRecordsNumber - 1];
		for (size_t i = 0; i < huffRecordsNumber - 1; ++i) {
			const size_t shift = *(beginIt + i + 1) - huffDataOffset;
			myDicts[i] = myData + shift;
		}
	} else {
		myErrorCode = ERROR_CORRUPTED_FILE;
	}
}
// Reads the next non-empty line from `stream` into `buffer`.
//
// `buffer` is cleared first. Line-break bytes (LF = 10, CR = 13) that occur
// before any other character are skipped; the first line break after at
// least one character ends the line (and is consumed). If the stream ends
// first, `buffer` holds whatever was collected so far, possibly nothing.
static void readLine(ZLInputStream &stream, std::string &buffer) {
	buffer.clear();
	char ch;
	while (stream.read(&ch, 1) == 1) {
		const bool isLineBreak = (ch == 10) || (ch == 13);
		if (!isLineBreak) {
			buffer += ch;
		} else if (!buffer.empty()) {
			return;
		}
	}
}
// Reads a PPL document from `stream` into the book model.
//
// Returns false if the stream cannot be opened, true otherwise. After the
// 5-byte signature is skipped, the content is read in BUFFER_SIZE chunks and
// split at '\n': text between line breaks is converted with myConverter and
// appended to myCurrentParagraph, and addParagraph() is called at every line
// break and once more at the end of the document.
//
// Line breaks are located with a length-bounded memchr(), so an embedded NUL
// byte no longer hides the rest of the chunk and no terminator has to be
// written at myBuffer[size]. The debug trace previously printed to stdout on
// every call has been removed.
bool PPLBookReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return false;
	}

	myModelReader.setMainTextModel();
	myModelReader.pushKind(REGULAR);
	myCurrentParagraph.erase();
	myEmptyLineCounter = 0;

	// "PPL\r\n"
	stream.seek(5);

	size_t size;
	do {
		size = stream.read(myBuffer, BUFFER_SIZE);

		const char *start = myBuffer;
		const char *end = myBuffer + size;
		const char *eol;
		do {
			eol = (const char*)memchr(start, '\n', end - start);
			if (eol != 0) {
				if (start < eol) {
					myConverter->convert(myCurrentParagraph, start, eol);
				}
				addParagraph();
				start = eol + 1;
			} else if (start < end) {
				// Unterminated tail: kept in myCurrentParagraph and continued
				// by the next chunk.
				myConverter->convert(myCurrentParagraph, start, end);
			}
		} while (eol != 0);
	} while (size == BUFFER_SIZE);

	addParagraph();

	stream.close();
	return true;
}
// UTF-16 counterpart of the plain-text reader loop: walks the stream in
// 2048-byte chunks, two bytes at a time, splitting at line breaks and
// forwarding the pieces to the owning reader (myReader).
//
// getAscii()/setAscii() are used to inspect and rewrite the 2-byte unit at a
// given position (presumably yielding 0 for non-ASCII units -- see their
// definitions). The stream is expected to be opened and closed by the caller.
//
// Robustness notes:
//  * only complete 2-byte units are examined, and the lookahead for "\r\n"
//    is performed only when a second complete unit is available, so a chunk
//    with an odd number of bytes can no longer push `start` past `end`;
//    a dangling last byte is still forwarded to the converter with the tail;
//  * a "\r\n" pair split across two chunks is reported as one line break.
void TxtReaderCoreUtf16::readDocument(ZLInputStream &stream) {
	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	bool previousChunkEndedWithCR = false;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;
		char *ptr = buffer;

		if (previousChunkEndedWithCR && (end - ptr >= 2) && (getAscii(ptr) == '\n')) {
			// Second half of a split "\r\n"; the break was already reported.
			ptr += 2;
			start = ptr;
		}
		previousChunkEndedWithCR = false;

		for (; end - ptr >= 2; ptr += 2) {
			const char chr = getAscii(ptr);
			if (chr == '\n' || chr == '\r') {
				bool skipNewLine = false;
				if (chr == '\r') {
					if (end - ptr >= 4) {
						if (getAscii(ptr + 2) == '\n') {
							skipNewLine = true;
							setAscii(ptr, '\n');
						}
					} else {
						previousChunkEndedWithCR = true;
					}
				}
				if (start != ptr) {
					// The line-break unit itself is part of the data handed
					// to the converter.
					str.erase();
					myReader.myConverter->convert(str, start, ptr + 2);
					myReader.characterDataHandler(str);
				}
				if (skipNewLine) {
					ptr += 2;
				}
				start = ptr + 2;
				myReader.newLineHandler();
			} else if (chr != 0 && isspace((unsigned char)chr)) {
				if (chr != '\t') {
					setAscii(ptr, ' ');
				}
			}
		}

		// Flush the unterminated tail of the chunk.
		if (start != end) {
			str.erase();
			myReader.myConverter->convert(str, start, end);
			myReader.characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;
}
// Produces up to `maxSize` bytes of inflated data.
//
// Compressed input is pulled from `stream` in pieces of at most
// IN_BUFFER_SIZE bytes (never more than the myAvailableSize bytes still
// expected) and inflated into myBuffer until it holds at least `maxSize`
// bytes or the input is exhausted. The first min(maxSize, buffered) bytes
// are then copied to `buffer` (skipped when `buffer` is null) and removed
// from myBuffer. Returns the number of bytes handed out.
size_t ZLZDecompressor::decompress(ZLInputStream &stream, char *buffer, size_t maxSize) {
	while ((myBuffer.length() < maxSize) && (myAvailableSize > 0)) {
		const size_t chunkSize = std::min(myAvailableSize, (size_t)IN_BUFFER_SIZE);

		myZStream->next_in = (Bytef*)myInBuffer;
		myZStream->avail_in = stream.read(myInBuffer, chunkSize);
		// A short read means no more compressed input is available.
		if (myZStream->avail_in == chunkSize) {
			myAvailableSize -= chunkSize;
		} else {
			myAvailableSize = 0;
		}

		while (myZStream->avail_in > 0) {
			myZStream->avail_out = OUT_BUFFER_SIZE;
			myZStream->next_out = (Bytef*)myOutBuffer;

			const int code = ::inflate(myZStream, Z_SYNC_FLUSH);
			const bool inflateFailed = (code != Z_OK) && (code != Z_STREAM_END);
			if (inflateFailed) {
				break;
			}
			// No output produced by this call.
			if (OUT_BUFFER_SIZE == myZStream->avail_out) {
				break;
			}
			myBuffer.append(myOutBuffer, OUT_BUFFER_SIZE - myZStream->avail_out);

			if (code == Z_STREAM_END) {
				myAvailableSize = 0;
				// Move the stream back over the input bytes that were read
				// but not consumed by inflate().
				stream.seek(0 - myZStream->avail_in, false);
				break;
			}
		}
	}

	const size_t realSize = std::min(maxSize, myBuffer.length());
	if (buffer != 0) {
		memcpy(buffer, myBuffer.data(), realSize);
	}
	myBuffer.erase(0, realSize);
	return realSize;
}
// Detects the book's encoding and language from the stream content.
//
// Returns true immediately when the book already has an encoding and `force`
// is false. Otherwise a missing encoding starts out as UTF-8; if
// auto-detection is enabled and the stream opens, up to 64 KiB are sampled
// and the detector's result overrides the encoding (and the language, when
// the detected one is non-empty). ASCII and "iso-8859-1" are mapped to
// "windows-1252". Encoding and language are always written back to the book.
//
// Returns true when the detector produced a result, false otherwise.
bool FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force) {
	std::string language = book.language();
	std::string encoding = book.encoding();
	if (!force && !encoding.empty()) {
		return true;
	}

	if (encoding.empty()) {
		encoding = ZLEncodingConverter::UTF8;
	}

	bool detected = false;

	if (PluginCollection::Instance().isLanguageAutoDetectEnabled() && stream.open()) {
		static const int BUFSIZE = 65536;
		char *sample = new char[BUFSIZE];
		const std::size_t sampleSize = stream.read(sample, BUFSIZE);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> info =
			ZLLanguageDetector().findInfo(sample, sampleSize);
		delete[] sample;

		if (!info.isNull()) {
			detected = true;
			if (!info->Language.empty()) {
				language = info->Language;
			}
			encoding = info->Encoding;
			const bool isLatinSubset =
				(encoding == ZLEncodingConverter::ASCII) || (encoding == "iso-8859-1");
			if (isLatinSubset) {
				encoding = "windows-1252";
			}
		}
	}

	book.setEncoding(encoding);
	book.setLanguage(language);
	return detected;
}
// Reads `length` bytes from `stream` into a string of exactly that length.
// The string is pre-filled with spaces, so positions not covered by a short
// read remain spaces.
static std::string readString(ZLInputStream &stream, std::size_t length) {
	std::string result;
	result.assign(length, ' ');
	stream.read(const_cast<char*>(result.data()), length);
	return result;
}
// Tokenises an HTML document read from `stream` and dispatches the pieces to
// the reader's handlers.
//
// The stream is consumed in 2048-byte chunks by a character-level state
// machine:
//   PS_TEXT            plain text, flushed through characterDataHandler();
//   PS_SPECIAL         an "&...;" entity (named, decimal or hexadecimal);
//   PS_TAGSTART        the character right after '<';
//   PS_COMMENT         inside "<!-- ... -->";
//   PS_TAGNAME         collecting a tag name;
//   PS_ATTRIBUTENAME   collecting an attribute name (stored upper-cased);
//   PS_ATTRIBUTEVALUE  collecting an (optionally quoted) attribute value;
//   PS_SKIPTAG         ignoring the rest of a tag with an empty name.
// `start` marks the beginning of the not-yet-consumed part of the current
// chunk; partial tokens at a chunk boundary are carried in currentString.
// Processing stops early when characterDataHandler() or tagHandler() returns
// false. startDocumentHandler()/endDocumentHandler() bracket the run.
//
// The <cctype> classification calls receive the character converted to
// unsigned char (passing a negative char is undefined behaviour), and the
// debug trace previously printed to stdout on every call has been removed.
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two characters seen inside a comment, used to detect "-->".
	char endOfComment[2] = { '\0', '\0' };

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	size_t length;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							// Not an entity after all: resume plain text here.
							start = ptr;
							state = PS_TEXT;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = PS_TEXT;
						}
					} else {
						if (*ptr == ';') {
							currentString.append(start, ptr - start);
							int number = specialSymbolNumber(state_special, currentString);
							if (number != 0) {
								char utf8[4];
								int len = ZLUnicodeUtil::ucs2ToUtf8(utf8, number);
								characterDataHandler(utf8, len, false);
							} else {
								// Unknown entity: pass it through literally.
								currentString = "&" + currentString + ";";
								characterDataHandler(currentString.c_str(), currentString.length(), false);
							}
							currentString.erase();
							start = ptr + 1;
							state = PS_TEXT;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = PS_TEXT;
						}
					}
					break;
				case PS_TAGSTART:
					state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if ((endOfComment[0] == '\0') && (*ptr != '-')) {
						// "<!" not followed by '-': treated as an ordinary tag.
						state = PS_TAGNAME;
					} else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_TAGNAME:
					if ((*ptr == '>') || isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if ((*ptr == '>') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
						if (ptr != start) {
							currentString.append(start, ptr - start);
							for (unsigned int i = 0; i < currentString.length(); ++i) {
								currentString[i] = toupper((unsigned char)currentString[i]);
							}
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// Counts the opening quote (only at the very start of
						// the value) and every quote after it.
						if ((ptr == start) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if ((quotationCounter != 1) && ((*ptr == '>') || isspace((unsigned char)*ptr))) {
						if (ptr != start) {
							currentString.append(start, ptr - start);
							if (currentString[0] == '"') {
								// Strip the surrounding quotes.
								currentString = currentString.substr(1, currentString.length() - 2);
							}
							currentTag.setLastAttributeValue(currentString);
							currentString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}

		// Carry over (or flush) whatever is left of this chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
				case PS_SPECIAL:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
					break;
			}
		}
	} while (length == BUFSIZE);

endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
// Reads 2 bytes from `stream` and returns them as a little-endian 16-bit
// value (first byte is the least significant).
//
// The byte buffer is unsigned and zero-initialised: no per-byte masking is
// needed, and bytes missing because of a short read contribute 0 instead of
// indeterminate stack contents.
unsigned short ZLZipHeader::readShort(ZLInputStream &stream) {
	unsigned char bytes[2] = { 0, 0 };
	stream.read((char*)bytes, 2);
	return (unsigned short)((bytes[1] << 8) | bytes[0]);
}
// Tokenises an HTML document read from `stream` and dispatches the pieces to
// the reader's handlers.
//
// The stream is consumed in 2048-byte chunks by a character-level state
// machine:
//   PS_TEXT                       plain text, flushed via characterDataHandler();
//   PS_SPECIAL                    an "&...;" entity inside text;
//   PS_SPECIAL_IN_ATTRIBUTEVALUE  an "&...;" entity inside an attribute value;
//   PS_TAGSTART                   the character right after '<';
//   PS_COMMENT                    inside "<!-- ... -->";
//   PS_TAGNAME                    collecting a tag name;
//   PS_ATTRIBUTENAME              collecting an attribute name (lower-cased);
//   PS_ATTRIBUTEVALUE             collecting an (optionally quoted) value;
//   PS_WAIT_END_OF_TAG            after '/', waiting for the closing '>';
//   PS_SKIPTAG                    ignoring the rest of a tag with an empty name.
// A '/' inside a tag reports the tag twice: once as it is and once with
// Start set to false. currentTag.Offset records the absolute stream offset
// of the opening '<'. Partial tokens at a chunk boundary are carried in
// currentString / specialString. Processing stops early when
// characterDataHandler() (for text) or tagHandler() returns false.
//
// std::isalpha/std::isdigit receive the character converted to unsigned
// char, matching the std::isspace calls in this function; passing a negative
// char to these functions is undefined behaviour.
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	std::string attributeValueString;
	std::string specialString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two characters seen inside a comment, used to detect "-->".
	char endOfComment[2] = { '\0', '\0' };

	const std::size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::size_t length;
	std::size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
						currentTag.Offset = offset + (ptr - buffer);
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (std::isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							// Not an entity after all: resume the enclosing state here.
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (std::isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else {
						if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if (128 <= number && number <= 159) {
								// Codes 128..159 are passed on as a single raw
								// byte to be converted from the document encoding.
								char ch = number;
								if (state == PS_SPECIAL) {
									characterDataHandler(&ch, 1, true);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								char utf8[4];
								int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, number);
								if (state == PS_SPECIAL) {
									characterDataHandler(utf8, len, false);
								} else {
									attributeValueString.append(utf8, len);
								}
							} else {
								// Unknown entity: pass it through literally.
								specialString = "&" + specialString + ";";
								if (state == PS_SPECIAL) {
									characterDataHandler(specialString.c_str(), specialString.length(), false);
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					}
					break;
				case PS_TAGSTART:
					state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if (endOfComment[0] == '\0' && *ptr != '-') {
						// "<!" not followed by '-': treated as an ordinary tag.
						state = PS_TAGNAME;
					} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_WAIT_END_OF_TAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
				case PS_TAGNAME:
					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								// Self-closing tag: report open, then close.
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							ZLStringUtil::asciiToLowerInline(currentString);
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// Counts the opening quote (only at the very start of
						// the value) and every quote after it.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if (*ptr == '&') {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						appendString(attributeValueString, currentString);
						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
						state_special = ST_UNKNOWN;
					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							appendString(attributeValueString, currentString);
							if (attributeValueString[0] == '"') {
								// Strip the surrounding quotes.
								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
							}
							currentTag.setLastAttributeValue(attributeValueString);
							attributeValueString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}

		// Carry over (or flush) whatever is left of this chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);

endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
void PlainTextFormatDetector::detect(ZLInputStream &stream, PlainTextFormat &format) { if (!stream.open()) { return; } const unsigned int tableSize = 10; unsigned int lineCounter = 0; int emptyLineCounter = -1; unsigned int stringsWithLengthLessThan81Counter = 0; unsigned int stringIndentTable[tableSize] = { 0 }; unsigned int emptyLinesTable[tableSize] = { 0 }; unsigned int emptyLinesBeforeShortStringTable[tableSize] = { 0 }; bool currentLineIsEmpty = true; unsigned int currentLineLength = 0; unsigned int currentLineIndent = 0; int currentNumberOfEmptyLines = -1; char *buffer = new char[BUFFER_SIZE]; int length; char previous = 0; do { length = stream.read(buffer, BUFFER_SIZE); const char *end = buffer + length; for (const char *ptr = buffer; ptr != end; ++ptr) { ++currentLineLength; if (*ptr == '\n') { ++lineCounter; if (currentLineIsEmpty) { ++emptyLineCounter; ++currentNumberOfEmptyLines; } else { if (currentNumberOfEmptyLines >= 0) { int index = std::min(currentNumberOfEmptyLines, (int)tableSize - 1); emptyLinesTable[index]++; if (currentLineLength < 51) { emptyLinesBeforeShortStringTable[index]++; } } currentNumberOfEmptyLines = -1; } if (currentLineLength < 81) { ++stringsWithLengthLessThan81Counter; } if (!currentLineIsEmpty) { stringIndentTable[std::min(currentLineIndent, tableSize - 1)]++; } currentLineIsEmpty = true; currentLineLength = 0; currentLineIndent = 0; } else if (*ptr == '\r') { continue; } else if (isspace((unsigned char)*ptr)) { if (currentLineIsEmpty) { ++currentLineIndent; } } else { currentLineIsEmpty = false; } previous = *ptr; } } while (length == BUFFER_SIZE); delete[] buffer; unsigned int nonEmptyLineCounter = lineCounter - emptyLineCounter; { unsigned int indent = 0; unsigned int lineWithIndent = 0; for (; indent < tableSize; ++indent) { lineWithIndent += stringIndentTable[indent]; if (lineWithIndent > 0.1 * nonEmptyLineCounter) { break; } } format.IgnoredIndentOption.setValue(indent + 1); } { int breakType = 0; breakType |= 
PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE; breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE; if (stringsWithLengthLessThan81Counter >= 0.5 * nonEmptyLineCounter) { breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT; } format.BreakTypeOption.setValue(breakType); } { unsigned int max = 0; unsigned index; int emptyLinesBeforeNewSection = -1; for (index = 2; index < tableSize; ++index) { if (max < emptyLinesBeforeShortStringTable[index]) { max = emptyLinesBeforeShortStringTable[index]; emptyLinesBeforeNewSection = index; } } if (emptyLinesBeforeNewSection > 0) { for (index = tableSize - 1; index > 0; --index) { emptyLinesTable[index - 1] += emptyLinesTable[index]; emptyLinesBeforeShortStringTable[index - 1] += emptyLinesBeforeShortStringTable[index]; } for (index = emptyLinesBeforeNewSection; index < tableSize; ++index) { if ((emptyLinesBeforeShortStringTable[index] > 2) && (emptyLinesBeforeShortStringTable[index] > 0.7 * emptyLinesTable[index])) { break; } } emptyLinesBeforeNewSection = (index == tableSize) ? -1 : (int)index; } format.EmptyLinesBeforeNewSectionOption.setValue(emptyLinesBeforeNewSection); format.CreateContentsTableOption.setValue(emptyLinesBeforeNewSection > 0); } format.InitializedOption.setValue(true); }
// Parses PML markup read from `stream`, chunk by chunk, until the stream is
// exhausted or myIsInterrupted becomes true. Returns myIsInterrupted.
//
// The parser is a three-state machine:
//   READ_NORMAL_DATA    plain text; '\n' triggers newLine(), '\\' starts a tag;
//   READ_TAG            collecting a tag name whose length is determined by
//                       findTagLength(); "\\\\" yields a literal backslash;
//   READ_TAG_PARAMETER  collecting the quoted parameter after "tag=".
// Text is reported through processCharData(), tags through processTag().
// Pieces of a tag name or parameter cut off by a chunk boundary are carried
// over in tagName / parameterString.
//
// Changes compared to the previous version:
//  * the tail handling for READ_NORMAL_DATA no longer falls through into the
//    READ_TAG case, which kept appending plain text to tagName (the text was
//    discarded again when the next tag started, so only memory was affected);
//  * isspace() receives the character converted to unsigned char;
//  * the write-only local tagCounter was removed.
//
// NOTE(review): FLAG ("only whitespace seen so far on this line") is a
// function-level static, so it persists across calls -- confirm this is
// intended.
bool PmlReader::parseDocument(ZLInputStream &stream) {
	enum {
		READ_NORMAL_DATA,
		READ_TAG,
		READ_TAG_PARAMETER,
	} parserState = READ_NORMAL_DATA;

	size_t tagNameLength = 0;
	std::string tagName;
	std::string parameterString;

	bool startParameterReading = false;
	static bool FLAG = true;

	while (!myIsInterrupted) {
		const char *ptr = myStreamBuffer;
		const char *end = myStreamBuffer + stream.read(myStreamBuffer, pmlStreamBufferSize);
		if (ptr == end) {
			break;
		}
		const char *dataStart = ptr;
		// Cleared when the current character must be looked at again in the
		// new state instead of being consumed.
		bool readNextChar = true;
		while (ptr != end) {
			switch (parserState) {
				case READ_NORMAL_DATA:
					if (*ptr == '\n') {
						if (ptr > dataStart) {
							processCharData(dataStart, ptr - dataStart);
						}
						newLine();
						FLAG = true;
						dataStart = ptr + 1;
					} else if (FLAG && isspace((unsigned char)*ptr)) {
						// Leading whitespace: nothing to do yet.
					} else {
						FLAG = false;
						if (*ptr == '\\') {
							if (ptr > dataStart) {
								processCharData(dataStart, ptr - dataStart);
							}
							dataStart = ptr + 1;
							tagName.erase();
							parserState = READ_TAG;
						}
					}
					break;
				case READ_TAG:
					if ((ptr == dataStart) && (tagName.empty())) {
						// First character after the backslash.
						if (*ptr == '\\') {
							processCharData(ptr, 1);
							dataStart = ptr + 1;
							parserState = READ_NORMAL_DATA;
						} else {
							tagNameLength = findTagLength(ptr);
							if (tagNameLength == 0) {
								dataStart = ptr + 1;
								parserState = READ_NORMAL_DATA;
							} else {
								--tagNameLength;
							}
						}
					} else {
						if (tagNameLength == 0) {
							tagName.append(dataStart, ptr - dataStart);
							if (*ptr == '=') {
								dataStart = ptr + 1;
								parameterString.erase();
								parserState = READ_TAG_PARAMETER;
							} else {
								readNextChar = false;
								processTag(tagName);
								dataStart = ptr;
								parserState = READ_NORMAL_DATA;
							}
						} else {
							--tagNameLength;
						}
					}
					break;
				case READ_TAG_PARAMETER:
					if (*ptr == '"') {
						if (!startParameterReading) {
							startParameterReading = true;
							dataStart = ptr + 1;
						} else {
							parameterString.append(dataStart, ptr - dataStart);
							processTag(tagName, parameterString);
							parserState = READ_NORMAL_DATA;
							dataStart = ptr + 1;
							startParameterReading = false;
						}
					}
					break;
			}
			if (readNextChar) {
				++ptr;
			} else {
				readNextChar = true;
			}
		}

		// Flush or carry over whatever is left of this chunk.
		if (dataStart < end) {
			switch (parserState) {
				case READ_NORMAL_DATA:
					processCharData(dataStart, end - dataStart);
					break;
				case READ_TAG:
					tagName.append(dataStart, end - dataStart);
					break;
				case READ_TAG_PARAMETER:
					parameterString.append(dataStart, end - dataStart);
					break;
				default:
					break;
			}
		}
	}
	return myIsInterrupted;
}