// Tries to determine the book's language from the first bytes of `stream`,
// assuming the text is stored in `encoding`.
//
// If the book already has a language and `force` is false, nothing is touched
// and true is returned. Otherwise, when auto-detection is enabled and the
// stream can be opened, up to 65536 bytes are sampled and handed to
// ZLLanguageDetector::findInfoForEncoding(). The book's language is then
// (re)written: with the detected language if one was found, otherwise with
// the value it already had.
//
// Returns true if the detector produced a result, false otherwise.
bool FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force) {
	std::string bookLanguage = book.language();
	if (!force && !bookLanguage.empty()) {
		return true;
	}

	bool found = false;
	if (PluginCollection::Instance().isLanguageAutoDetectEnabled()) {
		if (stream.open()) {
			static const int SAMPLE_CAPACITY = 65536;
			char *sample = new char[SAMPLE_CAPACITY];
			const std::size_t sampleLength = stream.read(sample, SAMPLE_CAPACITY);
			stream.close();

			// NOTE(review): -20000 is passed through to the detector unchanged;
			// its meaning (presumably a matching threshold) is not visible here.
			shared_ptr<ZLLanguageDetector::LanguageInfo> match =
				ZLLanguageDetector().findInfoForEncoding(encoding, sample, sampleLength, -20000);
			delete[] sample;

			if (!match.isNull()) {
				found = true;
				if (!match->Language.empty()) {
					bookLanguage = match->Language;
				}
			}
		}
	}

	book.setLanguage(bookLanguage);
	return found;
}
// Fills in the book's encoding and language when at least one of them is
// missing.
//
// Missing values are first replaced by the defaults configured in
// PluginCollection. If language auto-detection is switched on and the stream
// opens, up to 65536 bytes are sampled and passed to
// ZLLanguageDetector::findInfo(); a successful detection overrides the
// encoding and, when non-empty, the language. "US-ASCII" and "ISO-8859-1"
// results are replaced by "windows-1252".
//
// Does nothing when both values are already set.
void FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream) {
	std::string bookLanguage = book.language();
	std::string bookEncoding = book.encoding();
	if (!(bookEncoding.empty() || bookLanguage.empty())) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	if (bookLanguage.empty()) {
		bookLanguage = plugins.DefaultLanguageOption.value();
	}
	if (bookEncoding.empty()) {
		bookEncoding = plugins.DefaultEncodingOption.value();
	}

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int SAMPLE_CAPACITY = 65536;
		char *sample = new char[SAMPLE_CAPACITY];
		const size_t sampleLength = stream.read(sample, SAMPLE_CAPACITY);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> match =
			ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;

		if (!match.isNull()) {
			if (!match->Language.empty()) {
				bookLanguage = match->Language;
			}
			bookEncoding = match->Encoding;
			const bool isAsciiOrLatin1 =
				(bookEncoding == "US-ASCII") || (bookEncoding == "ISO-8859-1");
			if (isAsciiOrLatin1) {
				bookEncoding = "windows-1252";
			}
		}
	}

	book.setEncoding(bookEncoding);
	book.setLanguage(bookLanguage);
}
// Assigns a language to a book that has none.
//
// Starts from the default language configured in PluginCollection. If
// language auto-detection is switched on and the stream opens, up to 65536
// bytes are sampled and passed to ZLLanguageDetector::findInfo(); a non-empty
// detected language replaces the default.
//
// Does nothing when the book already has a language.
void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
	if (!book.language().empty()) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	// The book's own language is known to be empty at this point, so the
	// configured default is always the starting value.
	std::string bookLanguage = plugins.DefaultLanguageOption.value();

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int SAMPLE_CAPACITY = 65536;
		char *sample = new char[SAMPLE_CAPACITY];
		const size_t sampleLength = stream.read(sample, SAMPLE_CAPACITY);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> match =
			ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;

		if (!match.isNull() && !match->Language.empty()) {
			bookLanguage = match->Language;
		}
	}

	book.setLanguage(bookLanguage);
}
// Builds the entry index for the ZIP archive `containerName` by walking the
// headers readable from `containerStream`.
//
// The container's last-modified time is recorded first. If the stream cannot
// be opened the index is left empty. For each header with the local-file
// signature, the entry name is read, passed through
// AndroidUtil::convertNonUtfString(), and stored in myInfoMap together with
// the data offset, compression method and sizes.
ZLZipEntryCache::ZLZipEntryCache(const std::string &containerName, ZLInputStream &containerStream) : myContainerName(containerName) {
	//ZLLogger::Instance().println("ZipEntryCache", "creating cache for " + containerName);
	myLastModifiedTime = ZLFile(containerName).lastModified();

	if (!containerStream.open()) {
		return;
	}

	ZLZipHeader header;
	while (header.readFrom(containerStream)) {
		Info *entry = 0;

		const bool isLocalFile =
			header.Signature == (unsigned long)ZLZipHeader::SignatureLocalFile;
		if (isLocalFile) {
			// Read the raw name bytes straight into a pre-sized string.
			std::string rawName(header.NameLength, '\0');
			const unsigned int bytesRead =
				(unsigned int)containerStream.read((char*)rawName.data(), header.NameLength);
			if (bytesRead == header.NameLength) {
				rawName = AndroidUtil::convertNonUtfString(rawName);
				entry = &myInfoMap[rawName];
				// The entry data begins after the extra field that follows the name.
				entry->Offset = containerStream.offset() + header.ExtraLength;
				entry->CompressionMethod = header.CompressionMethod;
				entry->CompressedSize = header.CompressedSize;
				entry->UncompressedSize = header.UncompressedSize;
			}
		}

		ZLZipHeader::skipEntry(containerStream, header);

		// The size is copied again after skipEntry(): presumably skipEntry()
		// can update the header (e.g. when sizes follow the data) -- TODO confirm.
		if (entry != 0) {
			entry->UncompressedSize = header.UncompressedSize;
		}
	}

	containerStream.close();
}
void EReaderPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { if (!stream.open()) { //TODO maybe anything else opens stream return; } BookReader bookReader(model); PmlBookReader pmlBookReader(bookReader, format, encoding); bookReader.setMainTextModel(); pmlBookReader.readDocument(stream); EReaderStream &estream = (EReaderStream&)stream; const std::map<std::string, EReaderStream::ImageInfo>& imageIds = estream.images(); for(std::map<std::string, EReaderStream::ImageInfo>::const_iterator it = imageIds.begin(); it != imageIds.end(); ++it) { const std::string id = it->first; bookReader.addImage(id, new ZLFileImage(ZLFile(file.path(), it->second.Type), it->second.Offset, it->second.Size)); } const std::map<std::string, unsigned short>& footnoteIds = estream.footnotes(); for(std::map<std::string, unsigned short>::const_iterator it = footnoteIds.begin(); it != footnoteIds.end(); ++it) { const std::string id = it->first; if (estream.switchStreamDestination(EReaderStream::FOOTNOTE, id)) { bookReader.setFootnoteTextModel(id); bookReader.addHyperlinkLabel(id); pmlBookReader.readDocument(estream); } } stream.close(); }
// Builds the entry index for a ZIP archive by walking the headers readable
// from `baseStream`.
//
// If the stream cannot be opened the index is left empty. For each header
// with the local-file signature, the entry name is read and stored in
// myInfoMap together with the data offset, compression method and sizes.
ZLZipEntryCache::ZLZipEntryCache(ZLInputStream &baseStream) {
	if (!baseStream.open()) {
		return;
	}

	ZLZipHeader header;
	while (header.readFrom(baseStream)) {
		Info *entry = 0;

		if (header.Signature == ZLZipHeader::SignatureLocalFile) {
			// Read the raw name bytes straight into a pre-sized string.
			std::string name(header.NameLength, '\0');
			const unsigned int bytesRead =
				(unsigned int)baseStream.read((char*)name.data(), header.NameLength);
			if (bytesRead == header.NameLength) {
				entry = &myInfoMap[name];
				// The entry data begins after the extra field that follows the name.
				entry->Offset = baseStream.offset() + header.ExtraLength;
				entry->CompressionMethod = header.CompressionMethod;
				entry->CompressedSize = header.CompressedSize;
				entry->UncompressedSize = header.UncompressedSize;
			}
		}

		ZLZipHeader::skipEntry(baseStream, header);

		// The size is copied again after skipEntry(): presumably skipEntry()
		// can update the header (e.g. when sizes follow the data) -- TODO confirm.
		if (entry != 0) {
			entry->UncompressedSize = header.UncompressedSize;
		}
	}

	baseStream.close();
}
// Parses a plain-text document from `stream`.
//
// The actual parsing is delegated to myCore; this method only brackets it
// with the start/end document callbacks and opens/closes the stream. Nothing
// is called when the stream cannot be opened.
void TxtReader::readDocument(ZLInputStream &stream) {
	if (stream.open()) {
		startDocumentHandler();
		myCore->readDocument(stream);
		endDocumentHandler();
		stream.close();
	}
}
void PalmDocPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const { stream.open(); bool readAsPalmDoc = ((PalmDocStream&)stream).hasExtraSections(); stream.close(); if (readAsPalmDoc) { MobipocketHtmlBookReader(file, model, format, encoding).readDocument(stream); } else { SimplePdbPlugin::readDocumentInternal(file, model, format, encoding, stream); } }
// Parses a plain-text document from `stream`, reporting text and line breaks
// through the handler callbacks.
//
// The stream is read in 2048-byte chunks. Inside a chunk:
//  - '\n', '\r' and "\r\n" each end a line; any text preceding the break in
//    the current chunk is converted with myConverter (terminator included)
//    and passed to characterDataHandler(), then newLineHandler() is called;
//  - every other whitespace character except '\t' is replaced by a space;
//  - text left over at the end of a chunk is flushed to
//    characterDataHandler() without a line break.
//
// A "\r\n" pair that straddles two chunks is reported as a single line break:
// the '\r' ends the line when the first chunk is processed, and the '\n' that
// opens the next chunk is skipped. In that case the character data of the
// line ends with '\r', exactly as for a bare '\r' line ending.
//
// Nothing is called when the stream cannot be opened.
void TxtReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	// Set when the previous chunk's last byte was '\r', i.e. a line break has
	// already been reported and a leading '\n' would be its second half.
	bool previousChunkEndedWithCR = false;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;

		if (previousChunkEndedWithCR && length != 0 && *start == '\n') {
			++start;
		}
		// The loop below never rewrites a '\r' in the last position, so this
		// can be evaluated up front.
		previousChunkEndedWithCR = (length != 0) && (buffer[length - 1] == '\r');

		for (char *ptr = start; ptr != end; ++ptr) {
			if (*ptr == '\n' || *ptr == '\r') {
				bool skipNewLine = false;
				if (*ptr == '\r' && (ptr + 1) != end && *(ptr + 1) == '\n') {
					// CRLF inside one chunk: hand over a single '\n' and step
					// over the second byte afterwards.
					skipNewLine = true;
					*ptr = '\n';
				}
				if (start != ptr) {
					str.erase();
					myConverter->convert(str, start, ptr + 1);
					characterDataHandler(str);
				}
				if (skipNewLine) {
					++ptr;
				}
				start = ptr + 1;
				newLineHandler();
			} else if (*ptr != '\t' && isspace((unsigned char)*ptr)) {
				*ptr = ' ';
			}
		}

		if (start != end) {
			str.erase();
			myConverter->convert(str, start, end);
			characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;

	endDocumentHandler();
	stream.close();
}
// Feeds the whole content of `stream` to parse(const char*, int) in chunks of
// at most 1024 bytes. Reading stops at the first read that returns zero
// bytes. Does nothing when the stream cannot be opened.
void StyleSheetParser::parse(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	char *chunk = new char[1024];
	for (int chunkLength = stream.read(chunk, 1024);
			 chunkLength != 0;
			 chunkLength = stream.read(chunk, 1024)) {
		parse(chunk, chunkLength);
	}
	delete[] chunk;

	stream.close();
}
// Builds the header index for a tar archive readable from `baseStream`.
//
// Headers are read one after another until header.read() fails. Headers
// flagged as regular files are stored in myHeaderMap under their name. After
// each header the stream is advanced past the entry data, whose size is
// rounded up to the next multiple of 0x200 (512) bytes. The index is left
// empty when the stream cannot be opened.
ZLTarHeaderCache::ZLTarHeaderCache(ZLInputStream &baseStream) {
	if (baseStream.open()) {
		// header.erase() resets the header before the next read.
		for (ZLTarHeader header; header.read(baseStream); header.erase()) {
			if (header.IsRegularFile) {
				myHeaderMap[header.Name] = header;
			}
			// NOTE(review): the `false` argument presumably selects a seek
			// relative to the current position -- confirm against ZLInputStream.
			baseStream.seek((header.Size + 0x1ff) & -0x200, false);
		}
		baseStream.close();
	}
}
// Reads a PPL document from `stream` into the main text model.
//
// The first five bytes are skipped (the original comment associates them with
// the "PPL\r\n" signature). The rest is read in BUFFER_SIZE chunks into
// myBuffer; text is accumulated in myCurrentParagraph through myConverter and
// addParagraph() is called at every '\n', plus once more after the last chunk.
//
// Returns false if the stream cannot be opened, true otherwise.
bool PPLBookReader::readDocument(ZLInputStream &stream) {
	// NOTE(review): this trace line looks like leftover debug output.
	std::cout<<"PPLBookReader::readDocument\n";
	if (!stream.open()) {
		return false;
	}

	myModelReader.setMainTextModel();
	myModelReader.pushKind(REGULAR);
	myCurrentParagraph.erase();
	myEmptyLineCounter = 0;

	// "PPL\r\n"
	stream.seek(5);

	size_t size;
	do {
		size = stream.read(myBuffer, BUFFER_SIZE);
		// Terminate the chunk so strchr() stops at its end.
		// NOTE(review): assumes myBuffer holds at least BUFFER_SIZE + 1 bytes -- confirm.
		myBuffer[size] = '\0';

		const char *cursor = myBuffer;
		const char *const chunkEnd = myBuffer + size;

		// Emit one paragraph per '\n' found in the chunk.
		for (const char *eol = strchr(cursor, '\n'); eol != 0; eol = strchr(cursor, '\n')) {
			if (cursor < eol) {
				myConverter->convert(myCurrentParagraph, cursor, eol);
			}
			addParagraph();
			cursor = eol + 1;
		}

		// Whatever follows the last '\n' stays pending for the next chunk.
		if (cursor < chunkEnd) {
			myConverter->convert(myCurrentParagraph, cursor, chunkEnd);
		}
	} while (size == BUFFER_SIZE);

	addParagraph();
	stream.close();
	return true;
}
// Tries to determine the book's encoding and language from the first bytes of
// `stream`.
//
// If the book already has an encoding and `force` is false, nothing is
// touched and true is returned. Otherwise an empty encoding defaults to
// ZLEncodingConverter::UTF8; when auto-detection is enabled and the stream
// opens, up to 65536 bytes are sampled and passed to
// ZLLanguageDetector::findInfo(). A successful detection overrides the
// encoding and, when non-empty, the language. ZLEncodingConverter::ASCII and
// "iso-8859-1" results are replaced by "windows-1252". The resulting encoding
// and language are then written back to the book.
//
// Returns true if the detector produced a result, false otherwise.
bool FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force) {
	std::string bookLanguage = book.language();
	std::string bookEncoding = book.encoding();
	if (!force && !bookEncoding.empty()) {
		return true;
	}

	bool found = false;
	PluginCollection &plugins = PluginCollection::Instance();
	if (bookEncoding.empty()) {
		bookEncoding = ZLEncodingConverter::UTF8;
	}

	if (plugins.isLanguageAutoDetectEnabled() && stream.open()) {
		static const int SAMPLE_CAPACITY = 65536;
		char *sample = new char[SAMPLE_CAPACITY];
		const std::size_t sampleLength = stream.read(sample, SAMPLE_CAPACITY);
		stream.close();

		shared_ptr<ZLLanguageDetector::LanguageInfo> match =
			ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;

		if (!match.isNull()) {
			found = true;
			if (!match->Language.empty()) {
				bookLanguage = match->Language;
			}
			bookEncoding = match->Encoding;
			const bool isAsciiOrLatin1 =
				bookEncoding == ZLEncodingConverter::ASCII || bookEncoding == "iso-8859-1";
			if (isAsciiOrLatin1) {
				bookEncoding = "windows-1252";
			}
		}
	}

	book.setEncoding(bookEncoding);
	book.setLanguage(bookLanguage);
	return found;
}
// Tokenizes an HTML document read from `stream` and reports text and tags
// through characterDataHandler() and tagHandler().
//
// The stream is read in 2048-byte chunks and scanned one byte at a time by a
// state machine (`state`). `start` marks the beginning of the not yet
// consumed part of the current token inside the chunk; when a chunk ends in
// the middle of a token, the pending part is saved in currentString /
// specialString (or flushed as text) so the token can continue in the next
// chunk. `offset` is the stream position of the current chunk, used to record
// currentTag.Offset at each '<'.
//
// Parsing stops early, via `endOfProcessing`, as soon as a handler returns
// false. The buffer is freed, endDocumentHandler() is called and the stream
// is closed in every case. Nothing is called when the stream cannot be opened.
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	std::string attributeValueString;
	std::string specialString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two bytes seen inside a comment, used to recognize "-->".
	char endOfComment[2] = "\0";

	const std::size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::size_t length;
	std::size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
						currentTag.Offset = offset + (ptr - buffer);
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					// Character reference ("&name;", "&#123;", "&#x7b;"), either in
					// text or inside an attribute value. On anything unexpected
					// the machine falls back to the state it came from.
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						// <cctype> functions require a value representable as
						// unsigned char; a plain (possibly negative) char is
						// undefined behaviour for bytes >= 0x80.
						} else if (std::isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (std::isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else {
						if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if (128 <= number && number <= 159) {
								// Codes 128..159 are emitted as a single raw byte and
								// go through the encoding converter (presumably so
								// they are read in the document's 8-bit encoding --
								// TODO confirm).
								char ch = number;
								if (state == PS_SPECIAL) {
									characterDataHandler(&ch, 1, true);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								char utf8Buffer[4];
								int len = ZLUnicodeUtil::ucs4ToUtf8(utf8Buffer, number);
								if (state == PS_SPECIAL) {
									characterDataHandler(utf8Buffer, len, false);
								} else {
									attributeValueString.append(utf8Buffer, len);
								}
							} else {
								// Unknown reference: pass it through literally.
								specialString = "&" + specialString + ";";
								if (state == PS_SPECIAL) {
									characterDataHandler(specialString.c_str(), specialString.length(), false);
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					}
					break;
				case PS_TAGSTART:
					state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if (endOfComment[0] == '\0' && *ptr != '-') {
						// "<!" not followed by '-': treat as an ordinary tag
						// (e.g. a declaration), not as a comment.
						state = PS_TAGNAME;
					} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_WAIT_END_OF_TAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
				case PS_TAGNAME:
					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								// Self-closing tag: report it as an opening tag
								// immediately followed by a closing one.
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
						// currentString is non-empty when the name started in the
						// previous chunk.
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							ZLStringUtil::asciiToLowerInline(currentString);
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// quotationCounter: 0 = unquoted value, 1 = inside quotes,
						// 2 = closing quote seen.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if (*ptr == '&') {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						appendString(attributeValueString, currentString);
						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
						state_special = ST_UNKNOWN;
					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							appendString(attributeValueString, currentString);
							if (attributeValueString[0] == '"') {
								// Strip the surrounding quotes.
								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
							}
							currentTag.setLastAttributeValue(attributeValueString);
							attributeValueString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}
		// Carry the unfinished token over to the next chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);
endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
// Tokenizes an HTML document read from `stream` and reports text and tags
// through characterDataHandler() and tagHandler().
//
// The stream is read in 2048-byte chunks and scanned one byte at a time by a
// state machine (`state`). `start` marks the beginning of the not yet
// consumed part of the current token inside the chunk; when a chunk ends in
// the middle of a token, the pending part is saved in currentString (or
// flushed as text) so the token can continue in the next chunk. Attribute
// names are upper-cased before being added to the tag.
//
// Parsing stops early, via `endOfProcessing`, as soon as a handler returns
// false. The buffer is freed, endDocumentHandler() is called and the stream
// is closed in every case. Nothing but the trace line happens when the stream
// cannot be opened.
void HtmlReader::readDocument(ZLInputStream &stream) {
	// NOTE(review): this trace line looks like leftover debug output.
	std::cout<<"HtmlReader\n";
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two bytes seen inside a comment, used to recognize "-->".
	char endOfComment[2] = "\0";

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	size_t length;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			// All <cctype> calls below cast to unsigned char first: passing a
			// plain (possibly negative) char is undefined behaviour for bytes
			// >= 0x80.
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
					// Character reference ("&name;", "&#123;", "&#x7b;"). On
					// anything unexpected the machine falls back to plain text.
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							start = ptr;
							state = PS_TEXT;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = PS_TEXT;
						}
					} else {
						if (*ptr == ';') {
							currentString.append(start, ptr - start);
							int number = specialSymbolNumber(state_special, currentString);
							if (number != 0) {
								char utf8Buffer[4];
								int len = ZLUnicodeUtil::ucs2ToUtf8(utf8Buffer, number);
								characterDataHandler(utf8Buffer, len, false);
							} else {
								// Unknown reference: pass it through literally.
								currentString = "&" + currentString + ";";
								characterDataHandler(currentString.c_str(), currentString.length(), false);
							}
							currentString.erase();
							start = ptr + 1;
							state = PS_TEXT;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = PS_TEXT;
						}
					}
					break;
				case PS_TAGSTART:
					state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if ((endOfComment[0] == '\0') && (*ptr != '-')) {
						// "<!" not followed by '-': treat as an ordinary tag
						// (e.g. a declaration), not as a comment.
						state = PS_TAGNAME;
					} else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_TAGNAME:
					if ((*ptr == '>') || isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if ((*ptr == '>') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
						// currentString is non-empty when the name started in the
						// previous chunk and ended exactly at its boundary; it
						// must not be dropped in that case.
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							for (unsigned int i = 0; i < currentString.length(); ++i) {
								currentString[i] = toupper((unsigned char)currentString[i]);
							}
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// quotationCounter: 0 = unquoted value, 1 = inside quotes,
						// 2 = closing quote seen. A quote only opens the value if
						// no part of the value is pending from the previous chunk.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if ((quotationCounter != 1) && ((*ptr == '>') || isspace((unsigned char)*ptr))) {
						// As for attribute names, a value that ended exactly at
						// the previous chunk boundary is still pending here.
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							if (currentString[0] == '"') {
								// Strip the surrounding quotes.
								currentString = currentString.substr(1, currentString.length() - 2);
							}
							currentTag.setLastAttributeValue(currentString);
							currentString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}
		// Carry the unfinished token over to the next chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
				case PS_SPECIAL:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
					break;
			}
		}
	} while (length == BUFSIZE);
endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}