Ejemplo n.º 1
0
bool HtmlBookReader::tagHandler(const HtmlTag &tag) {
	myConverter->reset();

	if (tag.Start) {
		shared_ptr<TagData> tagData = new TagData();
		tagData->addEntry(myStyleSheetTable.control(tag.Name, ""));
		const std::string *cls = tag.find("class");
		if (cls != 0) {
			tagData->addEntry(myStyleSheetTable.control("", *cls));
			tagData->addEntry(myStyleSheetTable.control(tag.Name, *cls));
		}
		myTagDataStack.push_back(tagData);
	} else if (!myTagDataStack.empty()) {
		for (int i = myTagDataStack.back()->StyleEntries.size(); i > 0; --i) {
			myBookReader.addStyleCloseEntry();
		}
		myTagDataStack.pop_back();
	}
	const std::string *id = tag.find("id");
	if (id != 0) {
		myBookReader.addHyperlinkLabel(*id);
	}
	shared_ptr<HtmlTagAction> action = myActionMap[tag.Name];
	if (action.isNull()) {
		action = createAction(tag.Name);
		myActionMap[tag.Name] = action;
	}
	action->run(tag);

	if (tag.Start) {
		for (std::vector<shared_ptr<TagData> >::const_iterator it = myTagDataStack.begin(); it != myTagDataStack.end(); ++it) {
			const unsigned char depth = it - myTagDataStack.begin() + 1;
			const std::vector<shared_ptr<ZLTextStyleEntry> > &entries = (*it)->StyleEntries;
			const bool inheritedOnly = it + 1 != myTagDataStack.end();
			for (std::vector<shared_ptr<ZLTextStyleEntry> >::const_iterator jt = entries.begin(); jt != entries.end(); ++jt) {
				shared_ptr<ZLTextStyleEntry> entry = inheritedOnly ? (*jt)->inherited() : *jt;
				myBookReader.addStyleEntry(*entry, depth);
			}
		}
	}

	return true;
}
Ejemplo n.º 2
0
void HtmlTagList::addElement(const HtmlTag &x)
{
    HtmlTag newTag = x; //make a copy
    if (m_htmlTags.empty() == false)     //there are already some elements?
    {
        const HtmlTag &last = m_htmlTags.back(); //last element
        if (last.isClosing())   //last one is closing one? (</tag>)
        {
            if (newTag.isClosing()) //new one is also closing one? - decrease it's depth-level
                newTag.setLevel(last.getLevel() - 1);
            else                    //else - use the same one
                newTag.setLevel(last.getLevel());
        }
        else //opening
        {
            if (newTag.isOpening()) //new one is also opening? - increase its depth-level
                newTag.setLevel(last.getLevel() + 1);
            else                    //else - use the same one
                newTag.setLevel(last.getLevel());
        }
    }
    m_htmlTags.push_back(newTag);
}
Ejemplo n.º 3
0
/**
 * Parses the HTML read from `stream` and reports it through the handler
 * callbacks.
 *
 * The stream is consumed in fixed-size chunks that drive a character-level
 * state machine (ParseState). Text runs are passed to characterDataHandler(),
 * completed tags to tagHandler(); a tag in which '/' follows the name or an
 * attribute is reported twice, the second time with `Start` set to false.
 * Character references ("&name;", "&#NN;", "&#xHH;") are resolved through
 * specialSymbolNumber() both in text and inside attribute values. Parsing
 * stops early when a checked handler call returns false.
 * startDocumentHandler() / endDocumentHandler() bracket the run and the
 * stream is closed before returning.
 *
 * A '&' that does not turn out to start a well-formed reference is passed
 * through literally together with the characters scanned after it, and the
 * character that ended the attempt is then interpreted normally (so a '<' or
 * a closing quote right after it is still recognized).
 *
 * @param stream input to parse; the function returns immediately, without
 *               calling any handler, if it cannot be opened.
 */
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;        // tag/attribute text carried across chunk boundaries
	std::string attributeValueString; // attribute value assembled so far
	std::string specialString;        // character reference text carried across chunk boundaries
	int quotationCounter = 0;
	HtmlTag currentTag;
	char endOfComment[2] = "\0";      // last two characters seen inside "<!..."

	const std::size_t BUFSIZE = 2048;
	// Automatic storage: released on every exit path, no delete[] to forget.
	char buffer[BUFSIZE];
	std::size_t length;
	std::size_t offset = 0;
	do {
		length = stream.read(buffer, BUFSIZE);
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
			// `reprocess` is raised only when a character reference is abandoned:
			// the current character is then dispatched once more under the state
			// that was resumed. The resumed states never raise it, so the inner
			// loop runs at most twice per character.
			bool reprocess;
			do {
				reprocess = false;
				switch (state) {
					case PS_TEXT:
						if (*ptr == '<') {
							if (!characterDataHandler(start, ptr - start, true)) {
								goto endOfProcessing;
							}
							start = ptr + 1;
							state = PS_TAGSTART;
							currentTag.Offset = offset + (ptr - buffer);
						} else if (*ptr == '&') {
							if (!characterDataHandler(start, ptr - start, true)) {
								goto endOfProcessing;
							}
							start = ptr + 1;
							state = PS_SPECIAL;
							state_special = ST_UNKNOWN;
						}
						break;
					case PS_SPECIAL:
					case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					{
						const bool inText = (state == PS_SPECIAL);
						bool abandoned = false;
						// <cctype> classifiers are undefined for negative values,
						// hence the unsigned char conversions.
						if (state_special == ST_UNKNOWN) {
							if (*ptr == '#') {
								state_special = ST_NUM;
							} else if (std::isalpha(static_cast<unsigned char>(*ptr))) {
								state_special = ST_NAME;
							} else {
								abandoned = true;
							}
						} else if (state_special == ST_NUM) {
							if (*ptr == 'x') {
								state_special = ST_HEX;
							} else if (std::isdigit(static_cast<unsigned char>(*ptr))) {
								state_special = ST_DEC;
							} else {
								abandoned = true;
							}
						} else {
							if (*ptr == ';') {
								specialString.append(start, ptr - start);
								const int number = specialSymbolNumber(state_special, specialString);
								if (128 <= number && number <= 159) {
									// Values 128..159 are handed on as a single raw byte.
									char ch = number;
									if (inText) {
										characterDataHandler(&ch, 1, true);
									} else {
										myConverter->convert(attributeValueString, &ch, &ch + 1);
									}
								} else if (number != 0) {
									char utf8[4];
									int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, number);
									if (inText) {
										characterDataHandler(utf8, len, false);
									} else {
										attributeValueString.append(utf8, len);
									}
								} else {
									// Unknown reference: pass it through literally.
									specialString = "&" + specialString + ";";
									if (inText) {
										characterDataHandler(specialString.c_str(), specialString.length(), false);
									} else {
										attributeValueString += specialString;
									}
								}
								specialString.erase();
								start = ptr + 1;
								state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
							} else if (!allowSymbol(state_special, *ptr)) {
								abandoned = true;
							}
						}
						if (abandoned) {
							// Not a reference after all. Emit '&' plus everything scanned
							// since it (including any part carried over from the previous
							// chunk) the same way an unknown reference is emitted, and
							// clear the carry-over so it cannot leak into the next one.
							specialString.append(start, ptr - start);
							specialString = "&" + specialString;
							if (inText) {
								characterDataHandler(specialString.c_str(), specialString.length(), false);
							} else {
								attributeValueString += specialString;
							}
							specialString.erase();
							start = ptr;
							state = inText ? PS_TEXT : PS_ATTRIBUTEVALUE;
							reprocess = true;
						}
						break;
					}
					case PS_TAGSTART:
						state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
						break;
					case PS_COMMENT:
						if (endOfComment[0] == '\0' && *ptr != '-') {
							// "<!" not followed by '-': handled like an ordinary tag name.
							state = PS_TAGNAME;
						} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
							start = ptr + 1;
							state = PS_TEXT;
							endOfComment[0] = '\0';
							endOfComment[1] = '\0';
						} else {
							endOfComment[0] = endOfComment[1];
							endOfComment[1] = *ptr;
						}
						break;
					case PS_WAIT_END_OF_TAG:
						if (*ptr == '>') {
							start = ptr + 1;
							state = PS_TEXT;
						}
						break;
					case PS_TAGNAME:
						if (*ptr == '>' || *ptr == '/' || std::isspace(static_cast<unsigned char>(*ptr))) {
							currentString.append(start, ptr - start);
							start = ptr + 1;
							setTag(currentTag, currentString);
							currentString.erase();
							if (currentTag.Name == "") {
								state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
							} else {
								if (*ptr == '>') {
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									state = PS_TEXT;
								} else if (*ptr == '/') {
									// Report the tag as opened and immediately closed.
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									currentTag.Start = false;
									if (!tagHandler(currentTag)) {
										goto endOfProcessing;
									}
									state = PS_WAIT_END_OF_TAG;
								} else {
									state = PS_ATTRIBUTENAME;
								}
							}
						}
						break;
					case PS_ATTRIBUTENAME:
						if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace(static_cast<unsigned char>(*ptr))) {
							if (ptr != start || !currentString.empty()) {
								currentString.append(start, ptr - start);
								ZLStringUtil::asciiToLowerInline(currentString);
								currentTag.addAttribute(currentString);
								currentString.erase();
							}
							start = ptr + 1;
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
							}
						}
						break;
					case PS_ATTRIBUTEVALUE:
						if (*ptr == '"') {
							// Count a quote only if it opens the value or the value is
							// already quoted; 1 means "inside quotes".
							if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
								++quotationCounter;
							}
						} else if (*ptr == '&') {
							currentString.append(start, ptr - start);
							start = ptr + 1;
							appendString(attributeValueString, currentString);
							state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
							state_special = ST_UNKNOWN;
						} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace(static_cast<unsigned char>(*ptr)))) {
							// attributeValueString is checked too: a value that ends right
							// after a character reference has nothing pending in
							// [start, ptr) or currentString but must still be stored.
							if (ptr != start || !currentString.empty() || !attributeValueString.empty()) {
								currentString.append(start, ptr - start);
								appendString(attributeValueString, currentString);
								if (attributeValueString[0] == '"') {
									attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
								}
								currentTag.setLastAttributeValue(attributeValueString);
								attributeValueString.erase();
								quotationCounter = 0;
							}
							start = ptr + 1;
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
						break;
					case PS_SKIPTAG:
						if (*ptr == '>') {
							start = ptr + 1;
							state = PS_TEXT;
						}
						break;
				}
			} while (reprocess);
		}
		// Save whatever is still pending in this chunk before the buffer is reused.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE);

	// Input ended in the middle of a text-level character reference: emit
	// what was collected instead of dropping it.
	if (state == PS_SPECIAL) {
		specialString = "&" + specialString;
		characterDataHandler(specialString.c_str(), specialString.length(), false);
	}

endOfProcessing:
	endDocumentHandler();

	stream.close();
}
Ejemplo n.º 4
0
/**
 * Parses the HTML read from `stream` and reports it through the handler
 * callbacks.
 *
 * The stream is consumed in fixed-size chunks that drive a character-level
 * state machine (ParseState). Text runs are passed to characterDataHandler(),
 * completed tags to tagHandler(); attribute names are upper-cased before
 * being added to the tag. Character references in text ("&name;", "&#NN;",
 * "&#xHH;") are resolved through specialSymbolNumber(). Parsing stops early
 * when a checked handler call returns false. startDocumentHandler() /
 * endDocumentHandler() bracket the run and the stream is closed before
 * returning.
 *
 * A '&' that does not turn out to start a well-formed reference is passed
 * through literally together with the characters scanned after it, and the
 * character that ended the attempt is then interpreted as ordinary text
 * input (so a '<' right after it still starts a tag).
 *
 * @param stream input to parse; if it cannot be opened no handler is called.
 */
void HtmlReader::readDocument(ZLInputStream &stream) {
    // NOTE(review): looks like leftover debug tracing - confirm before removing.
    std::cout<<"HtmlReader\n";
    if (!stream.open()) {
        return;
    }

    startDocumentHandler();

    ParseState state = PS_TEXT;
    SpecialType state_special = ST_UNKNOWN;
    std::string currentString; // pending tag/attribute/reference text, carried across chunks
    int quotationCounter = 0;
    HtmlTag currentTag;
    char endOfComment[2] = "\0"; // last two characters seen inside "<!..."

    const size_t BUFSIZE = 2048;
    // Automatic storage: released on every exit path, no delete[] to forget.
    char buffer[BUFSIZE];
    size_t length;
    do {
        length = stream.read(buffer, BUFSIZE);
        char *start = buffer;
        char *endOfBuffer = buffer + length;
        for (char *ptr = buffer; ptr < endOfBuffer; ++ptr) {
            // `reprocess` is raised only when a character reference is abandoned:
            // the current character is then dispatched once more as PS_TEXT.
            // PS_TEXT never raises it, so the inner loop runs at most twice.
            bool reprocess;
            do {
                reprocess = false;
                switch (state) {
                case PS_TEXT:
                    if (*ptr == '<') {
                        if (!characterDataHandler(start, ptr - start, true)) {
                            goto endOfProcessing;
                        }
                        start = ptr + 1;
                        state = PS_TAGSTART;
                    } else if (*ptr == '&') {
                        if (!characterDataHandler(start, ptr - start, true)) {
                            goto endOfProcessing;
                        }
                        start = ptr + 1;
                        state = PS_SPECIAL;
                        state_special = ST_UNKNOWN;
                    }
                    break;
                case PS_SPECIAL:
                {
                    bool abandoned = false;
                    // <ctype.h> classifiers are undefined for negative values,
                    // hence the unsigned char conversions.
                    if (state_special == ST_UNKNOWN) {
                        if (*ptr == '#') {
                            state_special = ST_NUM;
                        } else if (isalpha(static_cast<unsigned char>(*ptr))) {
                            state_special = ST_NAME;
                        } else {
                            abandoned = true;
                        }
                    } else if (state_special == ST_NUM) {
                        if (*ptr == 'x') {
                            state_special = ST_HEX;
                        } else if (isdigit(static_cast<unsigned char>(*ptr))) {
                            state_special = ST_DEC;
                        } else {
                            abandoned = true;
                        }
                    } else {
                        if (*ptr == ';') {
                            currentString.append(start, ptr - start);
                            int number = specialSymbolNumber(state_special, currentString);
                            if (number != 0) {
                                char utf8[4];
                                int len = ZLUnicodeUtil::ucs2ToUtf8(utf8, number);
                                characterDataHandler(utf8, len, false);
                            } else {
                                // Unknown reference: pass it through literally.
                                currentString = "&" + currentString + ";";
                                characterDataHandler(currentString.c_str(), currentString.length(), false);
                            }
                            currentString.erase();
                            start = ptr + 1;
                            state = PS_TEXT;
                        } else if (!allowSymbol(state_special, *ptr)) {
                            abandoned = true;
                        }
                    }
                    if (abandoned) {
                        // Not a reference after all. Emit '&' plus everything scanned
                        // since it (including any part carried over from the previous
                        // chunk) the same way an unknown reference is emitted, and
                        // clear currentString so it cannot leak into the next tag.
                        currentString.append(start, ptr - start);
                        currentString = "&" + currentString;
                        characterDataHandler(currentString.c_str(), currentString.length(), false);
                        currentString.erase();
                        start = ptr;
                        state = PS_TEXT;
                        reprocess = true;
                    }
                    break;
                }
                case PS_TAGSTART:
                    state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
                    break;
                case PS_COMMENT:
                    if ((endOfComment[0] == '\0') && (*ptr != '-')) {
                        // "<!" not followed by '-': handled like an ordinary tag name.
                        state = PS_TAGNAME;
                    } else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
                        start = ptr + 1;
                        state = PS_TEXT;
                        endOfComment[0] = '\0';
                        endOfComment[1] = '\0';
                    } else {
                        endOfComment[0] = endOfComment[1];
                        endOfComment[1] = *ptr;
                    }
                    break;
                case PS_TAGNAME:
                    if ((*ptr == '>') || isspace(static_cast<unsigned char>(*ptr))) {
                        currentString.append(start, ptr - start);
                        start = ptr + 1;
                        setTag(currentTag, currentString);
                        currentString.erase();
                        if (currentTag.Name == "") {
                            state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
                        } else {
                            if (*ptr == '>') {
                                if (!tagHandler(currentTag)) {
                                    goto endOfProcessing;
                                }
                                state = PS_TEXT;
                            } else {
                                state = PS_ATTRIBUTENAME;
                            }
                        }
                    }
                    break;
                case PS_ATTRIBUTENAME:
                    if ((*ptr == '>') || (*ptr == '=') || isspace(static_cast<unsigned char>(*ptr))) {
                        // currentString may hold the head of a name that was split
                        // across two chunks, so an empty [start, ptr) is not enough
                        // to conclude that there is no name.
                        if ((ptr != start) || !currentString.empty()) {
                            currentString.append(start, ptr - start);
                            for (unsigned int i = 0; i < currentString.length(); ++i) {
                                currentString[i] = toupper(static_cast<unsigned char>(currentString[i]));
                            }
                            currentTag.addAttribute(currentString);
                            currentString.erase();
                        }
                        start = ptr + 1;
                        if (*ptr == '>') {
                            if (!tagHandler(currentTag)) {
                                goto endOfProcessing;
                            }
                            state = PS_TEXT;
                        } else {
                            state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
                        }
                    }
                    break;
                case PS_ATTRIBUTEVALUE:
                    if (*ptr == '"') {
                        // Count a quote only if it opens the value or the value is
                        // already quoted; 1 means "inside quotes". A quote at the
                        // start of a chunk opens the value only when nothing of the
                        // value was carried over.
                        if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
                            ++quotationCounter;
                        }
                    } else if ((quotationCounter != 1) && ((*ptr == '>') || isspace(static_cast<unsigned char>(*ptr)))) {
                        if ((ptr != start) || !currentString.empty()) {
                            currentString.append(start, ptr - start);
                            if (currentString[0] == '"') {
                                currentString = currentString.substr(1, currentString.length() - 2);
                            }
                            currentTag.setLastAttributeValue(currentString);
                            currentString.erase();
                            quotationCounter = 0;
                        }
                        start = ptr + 1;
                        if (*ptr == '>') {
                            if (!tagHandler(currentTag)) {
                                goto endOfProcessing;
                            }
                            state = PS_TEXT;
                        } else {
                            state = PS_ATTRIBUTENAME;
                        }
                    }
                    break;
                case PS_SKIPTAG:
                    if (*ptr == '>') {
                        start = ptr + 1;
                        state = PS_TEXT;
                    }
                    break;
                }
            } while (reprocess);
        }
        // Save whatever is still pending in this chunk before the buffer is reused.
        if (start != endOfBuffer) {
            switch (state) {
            case PS_TEXT:
                if (!characterDataHandler(start, endOfBuffer - start, true)) {
                    goto endOfProcessing;
                }
                break;
            case PS_TAGNAME:
            case PS_ATTRIBUTENAME:
            case PS_ATTRIBUTEVALUE:
            case PS_SPECIAL:
                currentString.append(start, endOfBuffer - start);
                break;
            case PS_TAGSTART:
            case PS_SKIPTAG:
            case PS_COMMENT:
                break;
            }
        }
    } while (length == BUFSIZE);

    // Input ended in the middle of a character reference: emit what was
    // collected instead of dropping it.
    if (state == PS_SPECIAL) {
        currentString = "&" + currentString;
        characterDataHandler(currentString.c_str(), currentString.length(), false);
    }

endOfProcessing:
    endDocumentHandler();

    stream.close();
}