/**
 * Tokenizes an HTML byte stream and reports the result through the handler
 * callbacks of this reader.
 *
 * The stream is read in chunks of BUFSIZE bytes and each byte is pushed
 * through a small state machine (ParseState). Text runs are passed to
 * characterDataHandler(), completed tags to tagHandler(), and the whole run
 * is bracketed by startDocumentHandler() / endDocumentHandler(). Parsing
 * stops early when a text flush or tagHandler() returns false.
 *
 * Tokens may straddle a chunk boundary; the unfinished part is carried over
 * in currentString (tag name, attribute name, attribute value),
 * specialString (character reference) and attributeValueString (attribute
 * value assembled so far).
 *
 * @param stream input to parse. It is opened here and closed before the
 *               function returns; if it cannot be opened the function
 *               returns immediately without calling any handler.
 */
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	std::string attributeValueString;
	std::string specialString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two characters seen inside a comment, used to detect "-->".
	char endOfComment[2] = "\0";

	const std::size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	std::size_t length;
	// Number of bytes consumed by the previous chunks.
	std::size_t offset = 0;

	do {
		length = stream.read(buffer, BUFSIZE);
		// `start` marks the beginning of the not-yet-consumed part of the
		// current token inside this chunk.
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		// `ptr` is advanced at the bottom of the body so that a `continue`
		// inside the switch re-examines the same character in a new state.
		for (char *ptr = buffer; ptr < endOfBuffer; ) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
						currentTag.Offset = offset + (ptr - buffer);
					} else if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					// NOTE(review): when a reference is abandoned below, the
					// characters consumed since '&' (and the '&' itself) are
					// not emitted. Confirm whether dropping them is intended.
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (std::isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							// Not a character reference: forget any fragment
							// saved at a chunk boundary and let the enclosing
							// state handle this character.
							specialString.erase();
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
							continue;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (std::isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							specialString.erase();
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
							continue;
						}
					} else {
						if (*ptr == ';') {
							specialString.append(start, ptr - start);
							const int number = specialSymbolNumber(state_special, specialString);
							if (128 <= number && number <= 159) {
								// Codes 128..159 are forwarded as one raw byte
								// (presumably so the encoding converter maps
								// them -- confirm against characterDataHandler).
								char ch = number;
								if (state == PS_SPECIAL) {
									characterDataHandler(&ch, 1, true);
								} else {
									myConverter->convert(attributeValueString, &ch, &ch + 1);
								}
							} else if (number != 0) {
								char utf8[4];
								int len = ZLUnicodeUtil::ucs4ToUtf8(utf8, number);
								if (state == PS_SPECIAL) {
									characterDataHandler(utf8, len, false);
								} else {
									attributeValueString.append(utf8, len);
								}
							} else {
								// Unknown reference: pass it through verbatim.
								specialString = "&" + specialString + ";";
								if (state == PS_SPECIAL) {
									characterDataHandler(specialString.c_str(), specialString.length(), false);
								} else {
									attributeValueString += specialString;
								}
							}
							specialString.erase();
							start = ptr + 1;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
						} else if (!allowSymbol(state_special, *ptr)) {
							specialString.erase();
							start = ptr;
							state = (state == PS_SPECIAL) ? PS_TEXT : PS_ATTRIBUTEVALUE;
							continue;
						}
					}
					break;
				case PS_TAGSTART:
					state = *ptr == '!' ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if (endOfComment[0] == '\0' && *ptr != '-') {
						// "<!" not followed by '-': treated as an ordinary tag.
						state = PS_TAGNAME;
					} else if (endOfComment[0] == '-' && endOfComment[1] == '-' && *ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_WAIT_END_OF_TAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
				case PS_TAGNAME:
					if (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = *ptr == '>' ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else if (*ptr == '/') {
								// Self-closing tag: report it as an opening
								// tag immediately followed by a closing one.
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								currentTag.Start = false;
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_WAIT_END_OF_TAG;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if (*ptr == '>' || *ptr == '/' || *ptr == '=' || std::isspace((unsigned char)*ptr)) {
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							ZLStringUtil::asciiToLowerInline(currentString);
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// Count a quote only if it opens the value or the
						// value is already quoted.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if (*ptr == '&') {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						appendString(attributeValueString, currentString);
						state = PS_SPECIAL_IN_ATTRIBUTEVALUE;
						state_special = ST_UNKNOWN;
					} else if (quotationCounter != 1 && (*ptr == '>' || *ptr == '/' || std::isspace((unsigned char)*ptr))) {
						// quotationCounter == 1 means we are inside an open
						// quote, where these characters belong to the value.
						if (ptr != start || !currentString.empty()) {
							currentString.append(start, ptr - start);
							appendString(attributeValueString, currentString);
							if (attributeValueString[0] == '"') {
								attributeValueString = attributeValueString.substr(1, attributeValueString.length() - 2);
							}
							currentTag.setLastAttributeValue(attributeValueString);
							attributeValueString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else if (*ptr == '/') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							currentTag.Start = false;
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_WAIT_END_OF_TAG;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
			++ptr;
		}

		// Save or flush whatever part of the current token is left in this
		// chunk before the buffer is overwritten by the next read.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_SPECIAL:
				case PS_SPECIAL_IN_ATTRIBUTEVALUE:
					specialString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
				case PS_WAIT_END_OF_TAG:
					break;
			}
		}
		offset += length;
	} while (length == BUFSIZE); // a short read is taken as end of input

endOfProcessing:
	delete[] buffer;
	endDocumentHandler();
	stream.close();
}
/**
 * Tokenizes an HTML byte stream and reports the result through the handler
 * callbacks of this reader.
 *
 * The stream is read in chunks of BUFSIZE bytes and each byte is pushed
 * through a small state machine (ParseState). Text runs are passed to
 * characterDataHandler(), completed tags to tagHandler(), and the whole run
 * is bracketed by startDocumentHandler() / endDocumentHandler(). Attribute
 * names are upper-cased before being added to the tag. Parsing stops early
 * when a text flush or tagHandler() returns false.
 *
 * A token that straddles a chunk boundary is carried over in currentString,
 * which is shared by tag names, attribute names/values and entities.
 *
 * @param stream input to parse. It is opened here and closed before the
 *               function returns; if it cannot be opened the function
 *               returns immediately without calling any handler.
 */
void HtmlReader::readDocument(ZLInputStream &stream) {
	if (!stream.open()) {
		return;
	}

	startDocumentHandler();

	ParseState state = PS_TEXT;
	SpecialType state_special = ST_UNKNOWN;
	std::string currentString;
	int quotationCounter = 0;
	HtmlTag currentTag;
	// Last two characters seen inside a comment, used to detect "-->".
	char endOfComment[2] = "\0";

	const size_t BUFSIZE = 2048;
	char *buffer = new char[BUFSIZE];
	size_t length;

	do {
		length = stream.read(buffer, BUFSIZE);
		// `start` marks the beginning of the not-yet-consumed part of the
		// current token inside this chunk.
		char *start = buffer;
		char *endOfBuffer = buffer + length;
		// `ptr` is advanced at the bottom of the body so that a `continue`
		// inside the switch re-examines the same character in a new state.
		for (char *ptr = buffer; ptr < endOfBuffer; ) {
			switch (state) {
				case PS_TEXT:
					if (*ptr == '<') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_TAGSTART;
					} else if (*ptr == '&') {
						if (!characterDataHandler(start, ptr - start, true)) {
							goto endOfProcessing;
						}
						start = ptr + 1;
						state = PS_SPECIAL;
						state_special = ST_UNKNOWN;
					}
					break;
				case PS_SPECIAL:
					// NOTE(review): when an entity is abandoned below, the
					// characters consumed since '&' (and the '&' itself) are
					// not emitted. Confirm whether dropping them is intended.
					if (state_special == ST_UNKNOWN) {
						if (*ptr == '#') {
							state_special = ST_NUM;
						} else if (isalpha((unsigned char)*ptr)) {
							state_special = ST_NAME;
						} else {
							// Not an entity: discard any fragment saved at a
							// chunk boundary (it would otherwise leak into the
							// next token) and re-examine this character as text.
							currentString.erase();
							start = ptr;
							state = PS_TEXT;
							continue;
						}
					} else if (state_special == ST_NUM) {
						if (*ptr == 'x') {
							state_special = ST_HEX;
						} else if (isdigit((unsigned char)*ptr)) {
							state_special = ST_DEC;
						} else {
							currentString.erase();
							start = ptr;
							state = PS_TEXT;
							continue;
						}
					} else {
						if (*ptr == ';') {
							currentString.append(start, ptr - start);
							int number = specialSymbolNumber(state_special, currentString);
							if (number != 0) {
								char utf8[4];
								int len = ZLUnicodeUtil::ucs2ToUtf8(utf8, number);
								characterDataHandler(utf8, len, false);
							} else {
								// Unknown entity: pass it through verbatim.
								currentString = "&" + currentString + ";";
								characterDataHandler(currentString.c_str(), currentString.length(), false);
							}
							currentString.erase();
							start = ptr + 1;
							state = PS_TEXT;
						} else if (!allowSymbol(state_special, *ptr)) {
							currentString.erase();
							start = ptr;
							state = PS_TEXT;
							continue;
						}
					}
					break;
				case PS_TAGSTART:
					state = (*ptr == '!') ? PS_COMMENT : PS_TAGNAME;
					break;
				case PS_COMMENT:
					if ((endOfComment[0] == '\0') && (*ptr != '-')) {
						// "<!" not followed by '-': treated as an ordinary tag.
						state = PS_TAGNAME;
					} else if ((endOfComment[0] == '-') && (endOfComment[1] == '-') && (*ptr == '>')) {
						start = ptr + 1;
						state = PS_TEXT;
						endOfComment[0] = '\0';
						endOfComment[1] = '\0';
					} else {
						endOfComment[0] = endOfComment[1];
						endOfComment[1] = *ptr;
					}
					break;
				case PS_TAGNAME:
					if ((*ptr == '>') || isspace((unsigned char)*ptr)) {
						currentString.append(start, ptr - start);
						start = ptr + 1;
						setTag(currentTag, currentString);
						currentString.erase();
						if (currentTag.Name == "") {
							state = (*ptr == '>') ? PS_TEXT : PS_SKIPTAG;
						} else {
							if (*ptr == '>') {
								if (!tagHandler(currentTag)) {
									goto endOfProcessing;
								}
								state = PS_TEXT;
							} else {
								state = PS_ATTRIBUTENAME;
							}
						}
					}
					break;
				case PS_ATTRIBUTENAME:
					if ((*ptr == '>') || (*ptr == '=') || isspace((unsigned char)*ptr)) {
						// The name may have started in the previous chunk, in
						// which case it is already (partly) in currentString.
						if ((ptr != start) || !currentString.empty()) {
							currentString.append(start, ptr - start);
							for (size_t i = 0; i < currentString.length(); ++i) {
								currentString[i] = toupper((unsigned char)currentString[i]);
							}
							currentTag.addAttribute(currentString);
							currentString.erase();
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = (*ptr == '=') ? PS_ATTRIBUTEVALUE : PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_ATTRIBUTEVALUE:
					if (*ptr == '"') {
						// Count a quote only if it opens the value or the
						// value is already quoted.
						if (((ptr == start) && currentString.empty()) || (quotationCounter > 0)) {
							++quotationCounter;
						}
					} else if ((quotationCounter != 1) && ((*ptr == '>') || isspace((unsigned char)*ptr))) {
						// quotationCounter == 1 means we are inside an open
						// quote, where these characters belong to the value.
						if ((ptr != start) || !currentString.empty()) {
							currentString.append(start, ptr - start);
							if (currentString[0] == '"') {
								currentString = currentString.substr(1, currentString.length() - 2);
							}
							currentTag.setLastAttributeValue(currentString);
							currentString.erase();
							quotationCounter = 0;
						}
						start = ptr + 1;
						if (*ptr == '>') {
							if (!tagHandler(currentTag)) {
								goto endOfProcessing;
							}
							state = PS_TEXT;
						} else {
							state = PS_ATTRIBUTENAME;
						}
					}
					break;
				case PS_SKIPTAG:
					if (*ptr == '>') {
						start = ptr + 1;
						state = PS_TEXT;
					}
					break;
			}
			++ptr;
		}

		// Save or flush whatever part of the current token is left in this
		// chunk before the buffer is overwritten by the next read.
		if (start != endOfBuffer) {
			switch (state) {
				case PS_TEXT:
					if (!characterDataHandler(start, endOfBuffer - start, true)) {
						goto endOfProcessing;
					}
					break;
				case PS_TAGNAME:
				case PS_ATTRIBUTENAME:
				case PS_ATTRIBUTEVALUE:
				case PS_SPECIAL:
					currentString.append(start, endOfBuffer - start);
					break;
				case PS_TAGSTART:
				case PS_SKIPTAG:
				case PS_COMMENT:
					break;
			}
		}
	} while (length == BUFSIZE); // a short read is taken as end of input

endOfProcessing:
	delete[] buffer;
	endDocumentHandler();
	stream.close();
}