void XMLTreeBuilder::processHTMLEntity(const AtomicXMLToken& token)
{
    HTMLEntitySearch search;
    const AtomicString& name = token.name();
    for (size_t i = 0; i < name.length(); ++i) {
        search.advance(name[i]);
        if (!search.isEntityPrefix()) {
            m_parser->stopParsing();
            return;
        }
    }
    search.advance(';');
    if (!search.isEntityPrefix()) {
        m_parser->stopParsing();
        return;
    }
    UChar32 entityValue = search.mostRecentMatch()->firstValue;
    // FIXME: We need to account for secondValue if any XML entities are longer
    // than one unicode character.
    ASSERT_NOT_REACHED();
    // Darin Adler writes:
    // You can see given the code above that this else is dead code. This code
    // is in a strange state. And the reinterpret_cast to UChar* makes the code
    // little-endian-specific. That is not good!
    if (entityValue <= 0xFFFF)
        appendToText(reinterpret_cast<UChar*>(&entityValue), 1);
    else {
        UChar utf16Pair[2] = { U16_LEAD(entityValue), U16_TRAIL(entityValue) };
        appendToText(utf16Pair, 2);
    }
}
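// A minimal sketch of an endian-independent way to emit the matched code
// point, prompted by the review note above. This is an illustration, not the
// code this file ships; it assumes UChar/UChar32 and ICU's U16_LEAD/U16_TRAIL
// macros (from <unicode/utf16.h>) are in scope, as they are elsewhere here.
static inline size_t toUTF16Units(UChar32 codePoint, UChar units[2])
{
    if (codePoint <= 0xFFFF) {
        // Copy the value into a UChar instead of reinterpreting the UChar32's
        // storage, so the result does not depend on host byte order.
        units[0] = static_cast<UChar>(codePoint);
        return 1;
    }
    units[0] = U16_LEAD(codePoint);
    units[1] = U16_TRAIL(codePoint);
    return 2;
}
// A caller could then write:
//     UChar units[2];
//     size_t count = toUTF16Units(entityValue, units);
//     appendToText(units, count);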
size_t decodeNamedEntityToUCharArray(const char* name, UChar result[4])
{
    HTMLEntitySearch search;
    while (*name) {
        search.advance(*name++);
        if (!search.isEntityPrefix())
            return 0;
    }
    search.advance(';');
    if (!search.isEntityPrefix())
        return 0;
    size_t numberOfCodePoints = appendUChar32ToUCharArray(search.mostRecentMatch()->firstValue, result);
    if (!search.mostRecentMatch()->secondValue)
        return numberOfCodePoints;
    return numberOfCodePoints + appendUChar32ToUCharArray(search.mostRecentMatch()->secondValue, result + numberOfCodePoints);
}
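// A hypothetical caller, only to illustrate the contract of
// decodeNamedEntityToUCharArray above; "notin" is just an example name. The
// name is passed without the leading '&' or the trailing ';' (the function
// advances the search over ';' itself), and the return value is the number
// of UChars written, or 0 if the name is not a complete known entity.
static void decodeNamedEntityExample()
{
    UChar expansion[4];
    size_t length = decodeNamedEntityToUCharArray("notin", expansion);
    if (length) {
        // expansion[0..length - 1] now holds the UTF-16 expansion of &notin;
        // (a single code unit, U+2209, for this particular name).
    }
}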
static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
{
    StringBuilder consumedCharacters;
    HTMLEntitySearch entitySearch;
    while (!source.isEmpty()) {
        cc = source.currentChar();
        entitySearch.advance(cc);
        if (!entitySearch.isEntityPrefix())
            break;
        consumedCharacters.append(cc);
        source.advance();
    }
    notEnoughCharacters = source.isEmpty();
    if (notEnoughCharacters) {
        // We can't decide on an entity because there might be a longer entity
        // that we could match if we had more data.
        unconsumeCharacters(source, consumedCharacters);
        return false;
    }
    if (!entitySearch.mostRecentMatch()) {
        unconsumeCharacters(source, consumedCharacters);
        return false;
    }
    if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
        // We've consumed too many characters. We need to walk the
        // source back to the point at which we had consumed an
        // actual entity.
        unconsumeCharacters(source, consumedCharacters);
        consumedCharacters.clear();
        const int length = entitySearch.mostRecentMatch()->length;
        const LChar* reference = entitySearch.mostRecentMatch()->entity;
        for (int i = 0; i < length; ++i) {
            cc = source.currentChar();
            ASSERT_UNUSED(reference, cc == *reference++);
            consumedCharacters.append(cc);
            source.advance();
            ASSERT(!source.isEmpty());
        }
        cc = source.currentChar();
    }
    if (entitySearch.mostRecentMatch()->lastCharacter() == ';' || !additionalAllowedCharacter || !(isASCIIAlphanumeric(cc) || cc == '=')) {
        decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
        if (entitySearch.mostRecentMatch()->secondValue)
            decodedEntity.append(entitySearch.mostRecentMatch()->secondValue);
        return true;
    }
    unconsumeCharacters(source, consumedCharacters);
    return false;
}
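// A worked example of the walk-back above (an illustration, not code from
// this file): for input "&notit;" the search consumes "noti" and stops when
// the following 't' is no longer an entity prefix, but the longest complete
// match is "not" (length 3), so the source is rewound and only "not" is
// re-consumed, leaving cc at the 'i' that follows it.
// In body text (additionalAllowedCharacter == 0) that match is decoded to
// U+00AC, which is the HTML spec's "I'm &notit; I tell you" example; inside
// an attribute value the next character 'i' is alphanumeric, so everything
// is unconsumed and the text is left literal.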
bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter)
{
    ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>');
    ASSERT(!notEnoughCharacters);
    ASSERT(decodedEntity.isEmpty());

    // State machine over the characters after '&': numeric forms
    // ("&#123;" and "&#x1F;") and named entities.
    enum EntityState { Initial, Number, MaybeHexLowerCaseX, MaybeHexUpperCaseX, Hex, Decimal, Named };
    EntityState entityState = Initial;
    UChar32 result = 0;
    Vector<UChar, 10> consumedCharacters;

    while (!source.isEmpty()) {
        UChar cc = *source;
        switch (entityState) {
        case Initial: {
            if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&')
                return false;
            if (additionalAllowedCharacter && cc == additionalAllowedCharacter)
                return false;
            if (cc == '#') {
                entityState = Number;
                break;
            }
            if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) {
                entityState = Named;
                continue;
            }
            return false;
        }
        case Number: {
            if (cc == 'x') {
                entityState = MaybeHexLowerCaseX;
                break;
            }
            if (cc == 'X') {
                entityState = MaybeHexUpperCaseX;
                break;
            }
            if (cc >= '0' && cc <= '9') {
                entityState = Decimal;
                continue;
            }
            source.push('#');
            return false;
        }
        case MaybeHexLowerCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            source.push('#');
            source.push('x');
            return false;
        }
        case MaybeHexUpperCaseX: {
            if (isHexDigit(cc)) {
                entityState = Hex;
                continue;
            }
            source.push('#');
            source.push('X');
            return false;
        }
        case Hex: {
            if (cc >= '0' && cc <= '9')
                result = result * 16 + cc - '0';
            else if (cc >= 'a' && cc <= 'f')
                result = result * 16 + 10 + cc - 'a';
            else if (cc >= 'A' && cc <= 'F')
                result = result * 16 + 10 + cc - 'A';
            else {
                if (cc == ';')
                    source.advanceAndASSERT(cc);
                return convertToUTF16(legalEntityFor(result), decodedEntity);
            }
            break;
        }
        case Decimal: {
            if (cc >= '0' && cc <= '9')
                result = result * 10 + cc - '0';
            else {
                if (cc == ';')
                    source.advanceAndASSERT(cc);
                return convertToUTF16(legalEntityFor(result), decodedEntity);
            }
            break;
        }
        case Named: {
            HTMLEntitySearch entitySearch;
            while (!source.isEmpty()) {
                cc = *source;
                entitySearch.advance(cc);
                if (!entitySearch.isEntityPrefix())
                    break;
                consumedCharacters.append(cc);
                source.advanceAndASSERT(cc);
            }
            notEnoughCharacters = source.isEmpty();
            if (notEnoughCharacters) {
                // We can't decide on an entity because there might be a longer
                // entity that we could match if we had more data.
                unconsumeCharacters(source, consumedCharacters);
                return false;
            }
            if (!entitySearch.mostRecentMatch()) {
                ASSERT(!entitySearch.currentValue());
                unconsumeCharacters(source, consumedCharacters);
                return false;
            }
            if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
                // We've consumed too many characters. We need to walk the
                // source back to the point at which we had consumed an
                // actual entity.
                unconsumeCharacters(source, consumedCharacters);
                consumedCharacters.clear();
                const int length = entitySearch.mostRecentMatch()->length;
                const UChar* reference = entitySearch.mostRecentMatch()->entity;
                for (int i = 0; i < length; ++i) {
                    cc = *source;
                    ASSERT_UNUSED(reference, cc == *reference++);
                    consumedCharacters.append(cc);
                    source.advanceAndASSERT(cc);
                    ASSERT(!source.isEmpty());
                }
                cc = *source;
            }
            if (entitySearch.mostRecentMatch()->lastCharacter() == ';' || !additionalAllowedCharacter || !(isAlphaNumeric(cc) || cc == '='))
                return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity);
            unconsumeCharacters(source, consumedCharacters);
            return false;
        }
        }
        consumedCharacters.append(cc);
        source.advanceAndASSERT(cc);
    }
    ASSERT(source.isEmpty());
    notEnoughCharacters = true;
    unconsumeCharacters(source, consumedCharacters);
    return false;
}
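// A hedged sketch of how a tokenizer might drive consumeHTMLEntity above
// after seeing '&'. emitCharacters() and emitCharacter() are assumed helpers
// for illustration only, not WebCore API. Passing 0 for
// additionalAllowedCharacter corresponds to text content; an attribute-value
// caller would pass its quote character or '>' instead.
static void emitCharacters(const Vector<UChar, 16>&); // assumed emitter, illustration only
static void emitCharacter(UChar); // assumed emitter, illustration only

static void consumeEntityExample(SegmentedString& source)
{
    Vector<UChar, 16> decoded;
    bool notEnoughCharacters = false;
    if (consumeHTMLEntity(source, decoded, notEnoughCharacters, 0)) {
        // For input like "#x2209;" (after the '&'), decoded now holds U+2209.
        emitCharacters(decoded);
    } else if (notEnoughCharacters) {
        // The consumed characters were pushed back; wait for more input.
    } else {
        // Not an entity: the '&' is literal text.
        emitCharacter('&');
    }
}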