inline static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) { StringBuilder consumedCharacters; HTMLEntitySearch entitySearch; while (!source.isEmpty()) { cc = source.currentChar(); entitySearch.advance(cc); if (!entitySearch.isEntityPrefix()) break; consumedCharacters.append(cc); source.advanceAndASSERT(cc); } notEnoughCharacters = source.isEmpty(); if (notEnoughCharacters) { // We can't an entity because there might be a longer entity // that we could match if we had more data. unconsumeCharacters(source, consumedCharacters); return false; } if (!entitySearch.mostRecentMatch()) { unconsumeCharacters(source, consumedCharacters); return false; } if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { // We've consumed too many characters. We need to walk the // source back to the point at which we had consumed an // actual entity. unconsumeCharacters(source, consumedCharacters); consumedCharacters.clear(); const int length = entitySearch.mostRecentMatch()->length; const LChar* reference = entitySearch.mostRecentMatch()->entity; for (int i = 0; i < length; ++i) { cc = source.currentChar(); ASSERT_UNUSED(reference, cc == *reference++); consumedCharacters.append(cc); source.advanceAndASSERT(cc); ASSERT(!source.isEmpty()); } cc = source.currentChar(); } if (entitySearch.mostRecentMatch()->lastCharacter() == ';' || !additionalAllowedCharacter || !(isAlphaNumeric(cc) || cc == '=')) { decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); if (entitySearch.mostRecentMatch()->secondValue) decodedEntity.append(entitySearch.mostRecentMatch()->secondValue); return true; } unconsumeCharacters(source, consumedCharacters); return false; }
bool consumeHTMLEntity(SegmentedString& source, Vector<UChar, 16>& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter) { ASSERT(!additionalAllowedCharacter || additionalAllowedCharacter == '"' || additionalAllowedCharacter == '\'' || additionalAllowedCharacter == '>'); ASSERT(!notEnoughCharacters); ASSERT(decodedEntity.isEmpty()); enum EntityState { Initial, Number, MaybeHexLowerCaseX, MaybeHexUpperCaseX, Hex, Decimal, Named }; EntityState entityState = Initial; UChar32 result = 0; Vector<UChar, 10> consumedCharacters; while (!source.isEmpty()) { UChar cc = *source; switch (entityState) { case Initial: { if (cc == '\x09' || cc == '\x0A' || cc == '\x0C' || cc == ' ' || cc == '<' || cc == '&') return false; if (additionalAllowedCharacter && cc == additionalAllowedCharacter) return false; if (cc == '#') { entityState = Number; break; } if ((cc >= 'a' && cc <= 'z') || (cc >= 'A' && cc <= 'Z')) { entityState = Named; continue; } return false; } case Number: { if (cc == 'x') { entityState = MaybeHexLowerCaseX; break; } if (cc == 'X') { entityState = MaybeHexUpperCaseX; break; } if (cc >= '0' && cc <= '9') { entityState = Decimal; continue; } source.push('#'); return false; } case MaybeHexLowerCaseX: { if (isHexDigit(cc)) { entityState = Hex; continue; } source.push('#'); source.push('x'); return false; } case MaybeHexUpperCaseX: { if (isHexDigit(cc)) { entityState = Hex; continue; } source.push('#'); source.push('X'); return false; } case Hex: { if (cc >= '0' && cc <= '9') result = result * 16 + cc - '0'; else if (cc >= 'a' && cc <= 'f') result = result * 16 + 10 + cc - 'a'; else if (cc >= 'A' && cc <= 'F') result = result * 16 + 10 + cc - 'A'; else { if (cc == ';') source.advanceAndASSERT(cc); return convertToUTF16(legalEntityFor(result), decodedEntity); } break; } case Decimal: { if (cc >= '0' && cc <= '9') result = result * 10 + cc - '0'; else { if (cc == ';') source.advanceAndASSERT(cc); return convertToUTF16(legalEntityFor(result), decodedEntity); } break; } case Named: { HTMLEntitySearch entitySearch; while (!source.isEmpty()) { cc = *source; entitySearch.advance(cc); if (!entitySearch.isEntityPrefix()) break; consumedCharacters.append(cc); source.advanceAndASSERT(cc); } notEnoughCharacters = source.isEmpty(); if (notEnoughCharacters) { // We can't an entity because there might be a longer entity // that we could match if we had more data. unconsumeCharacters(source, consumedCharacters); return false; } if (!entitySearch.mostRecentMatch()) { ASSERT(!entitySearch.currentValue()); unconsumeCharacters(source, consumedCharacters); return false; } if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { // We've consumed too many characters. We need to walk the // source back to the point at which we had consumed an // actual entity. unconsumeCharacters(source, consumedCharacters); consumedCharacters.clear(); const int length = entitySearch.mostRecentMatch()->length; const UChar* reference = entitySearch.mostRecentMatch()->entity; for (int i = 0; i < length; ++i) { cc = *source; ASSERT_UNUSED(reference, cc == *reference++); consumedCharacters.append(cc); source.advanceAndASSERT(cc); ASSERT(!source.isEmpty()); } cc = *source; } if (entitySearch.mostRecentMatch()->lastCharacter() == ';' || !additionalAllowedCharacter || !(isAlphaNumeric(cc) || cc == '=')) { return convertToUTF16(entitySearch.mostRecentMatch()->value, decodedEntity); } unconsumeCharacters(source, consumedCharacters); return false; } } consumedCharacters.append(cc); source.advanceAndASSERT(cc); } ASSERT(source.isEmpty()); notEnoughCharacters = true; unconsumeCharacters(source, consumedCharacters); return false; }