Пример #1
0
// Fills in the book's language when it does not have one yet.
//
// A book that already carries a language is left untouched. Otherwise the
// collection-wide default language is taken as the starting value and, when
// auto-detection is switched on and the stream can be opened, it is replaced
// by the language the detector reports for the first (up to) 65536 bytes of
// the stream, provided the detector returns a non-empty one.
void FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream) {
	if (!book.language().empty()) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	// The book has no language at this point, so start from the default.
	std::string chosenLanguage = plugins.DefaultLanguageOption.value();

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int SAMPLE_SIZE = 65536;
		char *sample = new char[SAMPLE_SIZE];
		const size_t sampleLength = stream.read(sample, SAMPLE_SIZE);
		// The sample is all the detector needs; release the stream early.
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> guess =
			ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;
		if (!guess.isNull() && !guess->Language.empty()) {
			chosenLanguage = guess->Language;
		}
	}
	book.setLanguage(chosenLanguage);
}
Пример #2
0
// Determines the book's language for text in a known encoding.
//
// Returns true without touching the book when a language is already set and
// `force` is false. Otherwise, if auto-detection is enabled and the stream
// opens, up to 65536 bytes are read and handed to the language detector
// together with `encoding`; the return value then tells whether the detector
// produced a result at all. The book's language is written back in every case
// that gets past the early return, even if nothing new was detected.
bool FormatPlugin::detectLanguage(Book &book, ZLInputStream &stream, const std::string &encoding, bool force) {
	std::string resultLanguage = book.language();
	if (!resultLanguage.empty() && !force) {
		return true;
	}

	bool gotInfo = false;
	PluginCollection &plugins = PluginCollection::Instance();
	if (plugins.isLanguageAutoDetectEnabled() && stream.open()) {
		static const int SAMPLE_SIZE = 65536;
		char *sample = new char[SAMPLE_SIZE];
		const std::size_t sampleLength = stream.read(sample, SAMPLE_SIZE);
		stream.close();
		// NOTE(review): the meaning of the -20000 argument is defined by
		// ZLLanguageDetector::findInfoForEncoding, which is not visible here.
		shared_ptr<ZLLanguageDetector::LanguageInfo> guess =
			ZLLanguageDetector().findInfoForEncoding(encoding, sample, sampleLength, -20000);
		delete[] sample;
		gotInfo = !guess.isNull();
		if (gotInfo && !guess->Language.empty()) {
			resultLanguage = guess->Language;
		}
	}
	book.setLanguage(resultLanguage);

	return gotInfo;
}
Пример #3
0
// Builds the entry table for the zip container `containerName` by walking
// the headers found in `containerStream`.
//
// The container's last-modified time is recorded first. If the stream cannot
// be opened the cache is left without entries. For every local-file header
// whose name can be read completely, an Info record (offset, compression
// method, sizes) is stored under the converted entry name.
ZLZipEntryCache::ZLZipEntryCache(const std::string &containerName, ZLInputStream &containerStream) : myContainerName(containerName) {
	//ZLLogger::Instance().println("ZipEntryCache", "creating cache for " + containerName);
	myLastModifiedTime = ZLFile(containerName).lastModified();
	if (!containerStream.open()) {
		return;
	}

	for (ZLZipHeader entryHeader; entryHeader.readFrom(containerStream); ) {
		Info *recorded = 0;
		const bool isLocalFile =
			entryHeader.Signature == (unsigned long)ZLZipHeader::SignatureLocalFile;
		if (isLocalFile) {
			std::string rawName(entryHeader.NameLength, '\0');
			const unsigned int nameBytes =
				(unsigned int)containerStream.read((char*)rawName.data(), entryHeader.NameLength);
			// A short read means the name is truncated; such an entry is not recorded.
			if (nameBytes == entryHeader.NameLength) {
				rawName = AndroidUtil::convertNonUtfString(rawName);
				Info &slot = myInfoMap[rawName];
				// The stream now sits right after the name; the entry data
				// starts after the extra field that follows it.
				slot.Offset = containerStream.offset() + entryHeader.ExtraLength;
				slot.CompressionMethod = entryHeader.CompressionMethod;
				slot.CompressedSize = entryHeader.CompressedSize;
				slot.UncompressedSize = entryHeader.UncompressedSize;
				recorded = &slot;
			}
		}
		ZLZipHeader::skipEntry(containerStream, entryHeader);
		if (recorded != 0) {
			// Re-read the size after skipping — presumably skipEntry can
			// update the header (e.g. from a trailing data descriptor); verify.
			recorded->UncompressedSize = entryHeader.UncompressedSize;
		}
	}
	containerStream.close();
}
Пример #4
0
// Fills in whichever of the book's encoding and language is still missing.
//
// Nothing happens when both are already set. Missing values start from the
// collection defaults. When auto-detection is enabled and the stream opens,
// up to 65536 bytes are sampled: a non-empty detected language replaces the
// current one, and the detected encoding always replaces the current one,
// with "US-ASCII" and "ISO-8859-1" mapped to "windows-1252".
void FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream) {
	std::string bookLanguage = book.language();
	std::string bookEncoding = book.encoding();
	const bool languageMissing = bookLanguage.empty();
	const bool encodingMissing = bookEncoding.empty();
	if (!languageMissing && !encodingMissing) {
		return;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	if (languageMissing) {
		bookLanguage = plugins.DefaultLanguageOption.value();
	}
	if (encodingMissing) {
		bookEncoding = plugins.DefaultEncodingOption.value();
	}

	if (plugins.LanguageAutoDetectOption.value() && stream.open()) {
		static const int SAMPLE_SIZE = 65536;
		char *sample = new char[SAMPLE_SIZE];
		const size_t sampleLength = stream.read(sample, SAMPLE_SIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> guess =
			ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;
		if (!guess.isNull()) {
			if (!guess->Language.empty()) {
				bookLanguage = guess->Language;
			}
			// Unlike the language, the encoding is taken over unconditionally.
			bookEncoding = guess->Encoding;
			const bool plainLatin =
				(bookEncoding == "US-ASCII") || (bookEncoding == "ISO-8859-1");
			if (plainLatin) {
				bookEncoding = "windows-1252";
			}
		}
	}
	book.setEncoding(bookEncoding);
	book.setLanguage(bookLanguage);
}
Пример #5
0
void EReaderPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const {
	if (!stream.open())	{
		//TODO maybe anything else opens stream
		return;
	}
	BookReader bookReader(model);
	PmlBookReader pmlBookReader(bookReader, format, encoding);
	bookReader.setMainTextModel();
	pmlBookReader.readDocument(stream);
	EReaderStream &estream = (EReaderStream&)stream;
	const std::map<std::string, EReaderStream::ImageInfo>& imageIds = estream.images();
	for(std::map<std::string, EReaderStream::ImageInfo>::const_iterator it = imageIds.begin(); it != imageIds.end(); ++it) {
		const std::string id = it->first;
		bookReader.addImage(id, new ZLFileImage(ZLFile(file.path(), it->second.Type), it->second.Offset, it->second.Size));
	}
	const std::map<std::string, unsigned short>& footnoteIds = estream.footnotes();
	for(std::map<std::string, unsigned short>::const_iterator it = footnoteIds.begin(); it != footnoteIds.end(); ++it) {
		const std::string id = it->first;
		if (estream.switchStreamDestination(EReaderStream::FOOTNOTE, id)) {
			bookReader.setFootnoteTextModel(id);
			bookReader.addHyperlinkLabel(id);
			pmlBookReader.readDocument(estream);
		}
	}
	stream.close();
}
Пример #6
0
// Builds the entry table by walking the zip headers found in `baseStream`.
//
// If the stream cannot be opened the cache is left without entries. For every
// local-file header whose name can be read completely, an Info record
// (offset, compression method, sizes) is stored under the entry name.
ZLZipEntryCache::ZLZipEntryCache(ZLInputStream &baseStream) {
	if (!baseStream.open()) {
		return;
	}

	ZLZipHeader zipHeader;
	while (true) {
		if (!zipHeader.readFrom(baseStream)) {
			break;
		}

		Info *stored = 0;
		if (zipHeader.Signature == ZLZipHeader::SignatureLocalFile) {
			std::string name(zipHeader.NameLength, '\0');
			const unsigned int got =
				(unsigned int)baseStream.read((char*)name.data(), zipHeader.NameLength);
			// Only a completely read name yields a table entry.
			if (got == zipHeader.NameLength) {
				stored = &myInfoMap[name];
				// Entry data begins after the extra field that follows the name.
				stored->Offset = baseStream.offset() + zipHeader.ExtraLength;
				stored->CompressionMethod = zipHeader.CompressionMethod;
				stored->CompressedSize = zipHeader.CompressedSize;
				stored->UncompressedSize = zipHeader.UncompressedSize;
			}
		}

		ZLZipHeader::skipEntry(baseStream, zipHeader);
		if (stored != 0) {
			// Re-read the size after skipping — presumably skipEntry can
			// update the header; verify against ZLZipHeader.
			stored->UncompressedSize = zipHeader.UncompressedSize;
		}
	}
	baseStream.close();
}
Пример #7
0
// Reads a text document from `stream`: the actual parsing is delegated to
// myCore, bracketed by the start/end document handlers. Nothing is done, and
// no handler is called, when the stream cannot be opened.
void TxtReader::readDocument(ZLInputStream &stream) {
	if (stream.open()) {
		startDocumentHandler();
		myCore->readDocument(stream);
		endDocumentHandler();
		stream.close();
	}
}
Пример #8
0
void PalmDocPlugin::readDocumentInternal(const ZLFile &file, BookModel &model, const PlainTextFormat &format, const std::string &encoding, ZLInputStream &stream) const {
	stream.open();
	bool readAsPalmDoc = ((PalmDocStream&)stream).hasExtraSections();
	stream.close();
	if (readAsPalmDoc) {
		MobipocketHtmlBookReader(file, model, format, encoding).readDocument(stream);
	} else {
		SimplePdbPlugin::readDocumentInternal(file, model, format, encoding, stream);
	}
}
Пример #9
0
// Reads the whole stream as plain text and reports it line by line.
//
// The stream is consumed in 2048-byte chunks. Each run of characters up to and
// including a line terminator is converted with myConverter and passed to
// characterDataHandler(), followed by a newLineHandler() call. "\n", "\r" and
// "\r\n" each count as one line break. Whitespace other than tab and the line
// terminators is replaced by a plain space before conversion. A read shorter
// than the chunk size is treated as end of input. Does nothing if the stream
// cannot be opened.
void TxtReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::string str;
	size_t length;
	// True when the previous chunk ended with '\r'. Its line break has been
	// reported already, so a '\n' opening the next chunk belongs to the same
	// "\r\n" pair and must not produce a second line break.
	bool chunkEndedWithCR = false;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		const char *end = buffer + length;
		if (chunkEndedWithCR && length != 0 && *start == '\n') {
			++start;
		}
		chunkEndedWithCR = false;
		for (char *ptr = start; ptr != end; ++ptr) {
			if (*ptr == '\n' || *ptr == '\r') {
				bool skipNewLine = false;
				if (*ptr == '\r') {
					if ((ptr + 1) == end) {
						// Cannot see the next character yet; remember the '\r'.
						chunkEndedWithCR = true;
					} else if (*(ptr + 1) == '\n') {
						// "\r\n" inside one chunk: treat the pair as a single '\n'.
						skipNewLine = true;
						*ptr = '\n';
					}
				}
				if (start != ptr) {
					str.erase();
					// The terminator at *ptr is part of the converted text.
					myConverter->convert(str, start, ptr + 1);
					characterDataHandler(str);
				}
				if (skipNewLine) {
					++ptr;
				}
				start = ptr + 1;
				newLineHandler();
			} else if (isspace((unsigned char)*ptr)) {
				if (*ptr != '\t') {
					*ptr = ' ';
				}
			}
		}
		// Text after the last terminator of this chunk is flushed as is; the
		// rest of that line arrives with the next chunk.
		if (start != end) {
			str.erase();
			myConverter->convert(str, start, end);
			characterDataHandler(str);
		}
	} while (length == BUFSIZE);
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
Пример #10
0
// Feeds the whole contents of `stream` to parse(const char*, int) in chunks
// of at most 1024 bytes. Reading stops at the first read that returns 0.
// Nothing happens if the stream cannot be opened.
void StyleSheetParser::parse(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	char *chunk = new char[1024];
	for (int got = stream.read(chunk, 1024); got != 0; got = stream.read(chunk, 1024)) {
		parse(chunk, got);
	}
	delete[] chunk;
	stream.close();
}
Пример #11
0
// Scans the tar archive in `baseStream` and remembers the header of every
// regular file, keyed by its name. The cache stays empty if the stream cannot
// be opened.
ZLTarHeaderCache::ZLTarHeaderCache(ZLInputStream &baseStream) {
	if (baseStream.open()) {
		// The header object is reused; it is erased after each entry.
		for (ZLTarHeader entry; entry.read(baseStream); entry.erase()) {
			if (entry.IsRegularFile) {
				myHeaderMap[entry.Name] = entry;
			}
			// Skip the entry's data: its size rounded up to a multiple of
			// 0x200 (512) bytes. The `false` flag presumably selects a
			// relative seek — confirm against ZLInputStream::seek.
			baseStream.seek((entry.Size + 0x1ff) & -0x200, false);
		}
		baseStream.close();
	}
}
Пример #12
0
// Reads a PPL document from `stream` into the model reader.
//
// After the 5-byte signature the text is read in BUFFER_SIZE chunks and split
// at '\n': the text before each newline is converted and appended to
// myCurrentParagraph, and addParagraph() is called for every newline and once
// more at the end of input. A read shorter than BUFFER_SIZE is treated as end
// of input. Returns false if the stream cannot be opened, true otherwise.
bool PPLBookReader::readDocument(ZLInputStream &stream) {
	// NOTE(review): this looks like a leftover debug trace on stdout — confirm
	// whether it is still wanted.
	std::cout<<"PPLBookReader::readDocument\n";
	if (!stream.open()) {
		return false;
	}

	myModelReader.setMainTextModel();
	myModelReader.pushKind(REGULAR);
	myCurrentParagraph.erase();
	myEmptyLineCounter = 0;

	// "PPL\r\n"
	stream.seek(5);

	size_t size;
	do {
		size = stream.read(myBuffer, BUFFER_SIZE);

		const char *start = myBuffer;
		const char *end = myBuffer + size;
		const char *eol;
		do {
			// Bounded search: unlike strchr it neither stops at a NUL byte
			// inside the data nor needs a terminator written at
			// myBuffer[size], which is one past BUFFER_SIZE bytes on a full read.
			eol = static_cast<const char*>(memchr(start, '\n', end - start));
			if (eol != 0) {
				if (start < eol) {
					myConverter->convert(myCurrentParagraph, start, eol);
				}
				addParagraph();
				start = eol + 1;
			} else {
				// No newline left in this chunk: keep the tail; the paragraph
				// continues with the next chunk.
				if (start < end) {
					myConverter->convert(myCurrentParagraph, start, end);
				}
			}
		} while (eol != 0);
	} while (size == BUFFER_SIZE);

	addParagraph();

	stream.close();

	return true;
}
Пример #13
0
// Determines the book's encoding and language.
//
// Returns true without touching the book when an encoding is already set and
// `force` is false. Otherwise a missing encoding starts out as UTF-8 and, if
// auto-detection is enabled and the stream opens, up to 65536 bytes are
// sampled: a non-empty detected language replaces the current one and the
// detected encoding always replaces the current one, with ASCII and
// "iso-8859-1" mapped to "windows-1252". Both values are written back to the
// book; the return value tells whether the detector produced a result.
bool FormatPlugin::detectEncodingAndLanguage(Book &book, ZLInputStream &stream, bool force) {
	std::string bookLanguage = book.language();
	std::string bookEncoding = book.encoding();

	const bool encodingKnown = !bookEncoding.empty();
	if (encodingKnown && !force) {
		return true;
	}

	PluginCollection &plugins = PluginCollection::Instance();
	if (!encodingKnown) {
		bookEncoding = ZLEncodingConverter::UTF8;
	}

	bool gotInfo = false;
	if (plugins.isLanguageAutoDetectEnabled() && stream.open()) {
		static const int SAMPLE_SIZE = 65536;
		char *sample = new char[SAMPLE_SIZE];
		const std::size_t sampleLength = stream.read(sample, SAMPLE_SIZE);
		stream.close();
		shared_ptr<ZLLanguageDetector::LanguageInfo> guess = ZLLanguageDetector().findInfo(sample, sampleLength);
		delete[] sample;
		gotInfo = !guess.isNull();
		if (gotInfo) {
			if (!guess->Language.empty()) {
				bookLanguage = guess->Language;
			}
			// The encoding is taken over even when the language is not.
			bookEncoding = guess->Encoding;
			if (bookEncoding == ZLEncodingConverter::ASCII || bookEncoding == "iso-8859-1") {
				bookEncoding = "windows-1252";
			}
		}
	}
	book.setEncoding(bookEncoding);
	book.setLanguage(bookLanguage);

	return gotInfo;
}
Пример #14
0
// Parses an HTML document from `stream` with a hand-written state machine.
//
// The stream is consumed in 2048-byte chunks. `state` survives chunk
// boundaries; `start` marks the beginning of the not-yet-consumed token inside
// the current chunk. Text is reported through characterDataHandler(), tags
// through tagHandler(); parsing stops early as soon as either returns false.
// Character references ("&...;") are decoded both in text and in attribute
// values. A read shorter than the chunk size is treated as end of input.
// startDocumentHandler()/endDocumentHandler() bracket the run. Does nothing
// if the stream cannot be opened.
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	// Partial tag name / attribute name / attribute value carried across chunks.
	std::string currentString;
	// Attribute value assembled so far, including decoded character references.
	std::string attributeValueString;
	// Partial character reference (the part between '&' and ';').
	std::string specialString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// The last two characters seen inside a comment, used to spot "-->".
	char endOfComment[2] = "\0";

	const std::size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::size_t length;
	// Number of bytes consumed before the current chunk.
	std::size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
						// Position of the '<' within the whole stream.
						currentTag.Offset = offset + (ptr - buffer);
					}
					if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					// Shared handling for character references in text and in
					// attribute values; only the destination of the decoded
					// result and the state to return to differ.
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						// The <cctype> classifiers require a value
						// representable as unsigned char; a negative plain
						// char is undefined behaviour.
						} else if (std::isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (std::isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					} else {
						if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if (128 <= number && number <= 159) {
								// Codes 128..159 are passed on as a single raw
								// byte that still goes through the document's
								// encoding converter.
								char ch = number;
								if (state == PS_SPECIAL) {
									characterDataHandler(&ch, 1, true);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								// Any other recognised code is emitted as UTF-8.
								char utf8Buffer[4];
								int len = ZLUnicodeUtil::ucs4ToUtf8(utf8Buffer, number);
								if (state == PS_SPECIAL) {
									characterDataHandler(utf8Buffer, len, false);
								} else {
									attributeValueString.append(utf8Buffer, len);
								}
							} else {
								// Unknown reference: pass it through literally.
								specialString = "&" + specialString + ";";
								if (state == PS_SPECIAL) {
									characterDataHandler(specialString.c_str(), specialString.length(), false);
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						}
					}
					break;
				case PS_TAGSTART:
					state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if (endOfComment[0] == '\0' && *ptr != '-') {
						// "<!" not followed by '-': handled like an ordinary tag.
						state = PS_TAGNAME;
					} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_WAIT_END_OF_TAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
				case PS_TAGNAME:
					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								// Self-closing tag: report it as an opening
								// tag immediately followed by a closing tag.
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
						// currentString is non-empty when the name started in
						// the previous chunk.
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							ZLStringUtil::asciiToLowerInline(currentString);
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// A quote counts as the opening quote only at the very
						// start of the value; after that every quote is counted.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if (*ptr == '&') {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						appendString(attributeValueString, currentString);
						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
						state_special = ST_UNKNOWN;
					// quotationCounter == 1 means an open quoted value, where
					// delimiters are ordinary characters.
					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							appendString(attributeValueString, currentString);
							if (attributeValueString[0] == '"') {
								// Strip the surrounding quotes.
								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
							}
							currentTag.setLastAttributeValue(attributeValueString);
							attributeValueString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
		}
		// End of chunk: flush plain text, or stash the unfinished token so it
		// can be completed from the next chunk.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);
endOfProcessing:
	delete[] buffer;

	endDocumentHandler();

	stream.close();
}
Пример #15
0
// Parses an HTML document from `stream` with a hand-written state machine.
//
// The stream is consumed in 2048-byte chunks. `state` survives chunk
// boundaries; `start` marks the beginning of the not-yet-consumed token inside
// the current chunk, and `currentString` carries an unfinished token over to
// the next chunk. Text is reported through characterDataHandler(), tags
// through tagHandler(); parsing stops early as soon as either returns false.
// A read shorter than the chunk size is treated as end of input.
// startDocumentHandler()/endDocumentHandler() bracket the run. Does nothing
// if the stream cannot be opened.
void HtmlReader::readDocument(ZLInputStream &stream) {
    // NOTE(review): this looks like a leftover debug trace on stdout — confirm
    // whether it is still wanted.
    std::cout<<"HtmlReader\n";
    if (!stream.open()) {
        return;
    }

    startDocumentHandler();

    ParseState state = PS_TEXT;
    SpecialType state_special = ST_UNKNOWN;
    // Partial token (tag name, attribute name/value, character reference)
    // carried across chunk boundaries.
    std::string currentString;
    int quotationCounter = 0;
    HtmlTag currentTag;
    // The last two characters seen inside a comment, used to spot "-->".
    char endOfComment[2] = "\0";

    const size_t BUFSIZE = 2048;
    char *buffer = new char[BUFSIZE];
    size_t length;
    do {
        length = stream.read(buffer, BUFSIZE);
        char *start = buffer;
        char *endOfBuffer = buffer + length;
        for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
            switch (state) {
            case PS_TEXT:
                if (*ptr == '<') {
                    if (!characterDataHandler(start, ptr - start, true)) {
                        goto endOfProcessing;
                    }
                    start = ptr + 1;
                    state = PS_TAGSTART;
                }
                if (*ptr == '&') {
                    if (!characterDataHandler(start, ptr - start, true)) {
                        goto endOfProcessing;
                    }
                    start = ptr + 1;
                    state = PS_SPECIAL;
                    state_special = ST_UNKNOWN;
                }
                break;
            case PS_SPECIAL:
                if (state_special == ST_UNKNOWN) {
                    if (*ptr == '#') {
                        state_special = ST_NUM;
                    // The <cctype> functions require a value representable as
                    // unsigned char; a negative plain char is undefined
                    // behaviour, hence the casts throughout this function.
                    } else if (isalpha((unsigned char)*ptr)) {
                        state_special = ST_NAME;
                    } else {
                        start = ptr;
                        state = PS_TEXT;
                    }
                } else if (state_special == ST_NUM) {
                    if (*ptr == 'x') {
                        state_special = ST_HEX;
                    } else if (isdigit((unsigned char)*ptr)) {
                        state_special = ST_DEC;
                    } else {
                        start = ptr;
                        state = PS_TEXT;
                    }
                } else {
                    if (*ptr == ';') {
                        currentString.append(start, ptr - start);
                        int number = specialSymbolNumber(state_special, currentString);
                        if (number != 0) {
                            // Recognised reference: emit it as UTF-8.
                            char utf8Buffer[4];
                            int len = ZLUnicodeUtil::ucs2ToUtf8(utf8Buffer, number);
                            characterDataHandler(utf8Buffer, len, false);
                        } else {
                            // Unknown reference: pass it through literally.
                            currentString = "&" + currentString + ";";
                            characterDataHandler(currentString.c_str(), currentString.length(), false);
                        }
                        currentString.erase();
                        start = ptr + 1;
                        state = PS_TEXT;
                    } else if (!allowSymbol(state_special, *ptr)) {
                        start = ptr;
                        state = PS_TEXT;
                    }
                }
                break;
            case PS_TAGSTART:
                state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
                break;
            case PS_COMMENT:
                if ((endOfComment[0] == '\0') && (*ptr != '-')) {
                    // "<!" not followed by '-': handled like an ordinary tag.
                    state = PS_TAGNAME;
                } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
                    start = ptr + 1;
                    state = PS_TEXT;
                    endOfComment[0] = '\0';
                    endOfComment[1] = '\0';
                } else {
                    endOfComment[0] = endOfComment[1];
                    endOfComment[1] = *ptr;
                }
                break;
            case PS_TAGNAME:
                if ((*ptr == '>') || isspace((unsigned char)*ptr)) {
                    currentString.append(start, ptr - start);
                    start = ptr + 1;
                    setTag(currentTag, currentString);
                    currentString.erase();
                    if (currentTag.Name == "") {
                        state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
                    } else {
                        if (*ptr == '>') {
                            if (!tagHandler(currentTag)) {
                                goto endOfProcessing;
                            }
                            state = PS_TEXT;
                        } else {
                            state = PS_ATTRIBUTENAME;
                        }
                    }
                }
                break;
            case PS_ATTRIBUTENAME:
                if ((*ptr == '>') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
                    // currentString is non-empty when the name began in the
                    // previous chunk; without this check a name ending exactly
                    // on a chunk boundary would be dropped and its text would
                    // leak into the next token.
                    if ((ptr != start) || !currentString.empty()) {
                        currentString.append(start, ptr - start);
                        for (unsigned int i = 0; i < currentString.length(); ++i) {
                            currentString[i] = toupper((unsigned char)currentString[i]);
                        }
                        currentTag.addAttribute(currentString);
                        currentString.erase();
                    }
                    start = ptr + 1;
                    if (*ptr == '>') {
                        if (!tagHandler(currentTag)) {
                            goto endOfProcessing;
                        }
                        state = PS_TEXT;
                    } else {
                        state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
                    }
                }
                break;
            case PS_ATTRIBUTEVALUE:
                if (*ptr == '"') {
                    // A quote opens the value only at its very start (nothing
                    // carried over from the previous chunk either); once a
                    // quote is open every further quote is counted.
                    if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
                        ++quotationCounter;
                    }
                // quotationCounter == 1 means an open quoted value, where
                // delimiters are ordinary characters.
                } else if ((quotationCounter != 1) && ((*ptr == '>') || isspace((unsigned char)*ptr))) {
                    // Same chunk-boundary consideration as for attribute names.
                    if ((ptr != start) || !currentString.empty()) {
                        currentString.append(start, ptr - start);
                        if (currentString[0] == '"') {
                            // Strip the surrounding quotes.
                            currentString = currentString.substr(1, currentString.length() - 2);
                        }
                        currentTag.setLastAttributeValue(currentString);
                        currentString.erase();
                        quotationCounter = 0;
                    }
                    start = ptr + 1;
                    if (*ptr == '>') {
                        if (!tagHandler(currentTag)) {
                            goto endOfProcessing;
                        }
                        state = PS_TEXT;
                    } else {
                        state = PS_ATTRIBUTENAME;
                    }
                }
                break;
            case PS_SKIPTAG:
                if (*ptr == '>') {
                    start = ptr + 1;
                    state = PS_TEXT;
                }
                break;
            }
        }
        // End of chunk: flush plain text, or stash the unfinished token so it
        // can be completed from the next chunk.
        if (start != endOfBuffer) {
            switch (state) {
            case PS_TEXT:
                if (!characterDataHandler(start, endOfBuffer - start, true)) {
                    goto endOfProcessing;
                }
                break;
            case PS_TAGNAME:
            case PS_ATTRIBUTENAME:
            case PS_ATTRIBUTEVALUE:
            case PS_SPECIAL:
                currentString.append(start, endOfBuffer - start);
                break;
            case PS_TAGSTART:
            case PS_SKIPTAG:
            case PS_COMMENT:
                break;
            }
        }
    } while (length == BUFSIZE);
endOfProcessing:
    delete[] buffer;

    endDocumentHandler();

    stream.close();
}
Пример #16
0
// Scans a plain-text stream and derives formatting heuristics from it.
//
// While reading, the function gathers per-line statistics (indentation of
// non-empty lines, runs of empty lines, how many lines are shorter than 81
// characters) and then stores four settings in `format`: the ignored indent,
// the paragraph break type, the number of empty lines that starts a new
// section, and whether a contents table should be built. Finally `format` is
// marked as initialized. Nothing is changed if the stream cannot be opened.
// The stream is closed again once reading is finished.
void PlainTextFormatDetector::detect(ZLInputStream &stream, PlainTextFormat &format) {
    if (!stream.open()) {
        return;
    }

    const unsigned int tableSize = 10;

    unsigned int lineCounter = 0;
    // Starts at -1, as does currentNumberOfEmptyLines below.
    int emptyLineCounter = -1;
    unsigned int stringsWithLengthLessThan81Counter = 0;
    // Histograms; the last slot collects every value >= tableSize - 1.
    unsigned int stringIndentTable[tableSize] = { 0 };
    unsigned int emptyLinesTable[tableSize] = { 0 };
    unsigned int emptyLinesBeforeShortStringTable[tableSize] = { 0 };

    bool currentLineIsEmpty = true;
    unsigned int currentLineLength = 0;
    unsigned int currentLineIndent = 0;
    // Length of the current run of empty lines; -1 when no run is in progress.
    int currentNumberOfEmptyLines = -1;

    char *buffer = new char[BUFFER_SIZE];
    int length;
    do {
        length = stream.read(buffer, BUFFER_SIZE);
        const char *end = buffer + length;
        for (const char *ptr = buffer; ptr != end; ++ptr) {
            ++currentLineLength;
            if (*ptr == '\n') {
                ++lineCounter;
                if (currentLineIsEmpty) {
                    ++emptyLineCounter;
                    ++currentNumberOfEmptyLines;
                } else {
                    if (currentNumberOfEmptyLines >= 0) {
                        int index = std::min(currentNumberOfEmptyLines, (int)tableSize - 1);
                        emptyLinesTable[index]++;
                        if (currentLineLength < 51) {
                            emptyLinesBeforeShortStringTable[index]++;
                        }
                    }
                    currentNumberOfEmptyLines = -1;
                }
                if (currentLineLength < 81) {
                    ++stringsWithLengthLessThan81Counter;
                }
                if (!currentLineIsEmpty) {
                    stringIndentTable[std::min(currentLineIndent, tableSize - 1)]++;
                }

                currentLineIsEmpty = true;
                currentLineLength = 0;
                currentLineIndent = 0;
            } else if (*ptr == '\r') {
                // Carriage returns affect neither indentation nor emptiness.
                continue;
            } else if (isspace((unsigned char)*ptr)) {
                if (currentLineIsEmpty) {
                    ++currentLineIndent;
                }
            } else {
                currentLineIsEmpty = false;
            }
        }
    } while (length == BUFFER_SIZE);
    delete[] buffer;
    // All input has been consumed; release the stream that was opened above.
    stream.close();

    unsigned int nonEmptyLineCounter = lineCounter - emptyLineCounter;

    {
        // Ignored indent: the smallest indent such that more than 10% of the
        // non-empty lines are indented by at most that much, plus one.
        unsigned int indent = 0;
        unsigned int lineWithIndent = 0;
        for (; indent < tableSize; ++indent) {
            lineWithIndent += stringIndentTable[indent];
            if (lineWithIndent > 0.1 * nonEmptyLineCounter) {
                break;
            }
        }
        format.IgnoredIndentOption.setValue(indent + 1);
    }

    {
        // Empty lines and new lines always break paragraphs; indented lines
        // do so only when the short-line count reaches half of the non-empty
        // line count.
        int breakType = 0;
        breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_EMPTY_LINE;
        breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_NEW_LINE;
        if (stringsWithLengthLessThan81Counter >= 0.5 * nonEmptyLineCounter) {
            breakType |= PlainTextFormat::BREAK_PARAGRAPH_AT_LINE_WITH_INDENT;
        }
        format.BreakTypeOption.setValue(breakType);
    }

    {
        // Section detection: find the histogram slot (index >= 2) that most
        // often precedes a short line.
        unsigned int max = 0;
        unsigned index;
        int emptyLinesBeforeNewSection = -1;
        for (index = 2; index < tableSize; ++index) {
            if (max < emptyLinesBeforeShortStringTable[index]) {
                max = emptyLinesBeforeShortStringTable[index];
                emptyLinesBeforeNewSection = index;
            }
        }
        if (emptyLinesBeforeNewSection > 0) {
            // Turn both histograms into "this slot or higher" totals.
            for (index = tableSize - 1; index > 0; --index) {
                emptyLinesTable[index - 1] += emptyLinesTable[index];
                emptyLinesBeforeShortStringTable[index - 1] += emptyLinesBeforeShortStringTable[index];
            }
            // Accept the first slot, from the candidate upwards, where more
            // than two such runs exist and over 70% of them precede a short
            // line.
            for (index = emptyLinesBeforeNewSection; index < tableSize; ++index) {
                if ((emptyLinesBeforeShortStringTable[index] > 2) &&
                    (emptyLinesBeforeShortStringTable[index] > 0.7 * emptyLinesTable[index])) {
                        break;
                }
            }
            emptyLinesBeforeNewSection = (index == tableSize) ? -1 : (int)index;
        }
        format.EmptyLinesBeforeNewSectionOption.setValue(emptyLinesBeforeNewSection);
        format.CreateContentsTableOption.setValue(emptyLinesBeforeNewSection > 0);
    }

    format.InitializedOption.setValue(true);
}