void GranularityStrategyTest::parseText(const TextNodeVector& textNodes) { bool wordStarted = false; int wordStartIndex = 0; for (auto& text : textNodes) { int wordStartIndexOffset = m_letterPos.size(); String str = text->wholeText(); for (size_t i = 0; i < str.length(); i++) { m_letterPos.append(visiblePositionToContentsPoint(createVisiblePosition(Position(text, i)))); char c = str.characterAt(i); if (isASCIIAlphanumeric(c) && !wordStarted) { wordStartIndex = i + wordStartIndexOffset; wordStarted = true; } else if (!isASCIIAlphanumeric(c) && wordStarted) { IntPoint wordMiddle((m_letterPos[wordStartIndex].x() + m_letterPos[i + wordStartIndexOffset].x()) / 2, m_letterPos[wordStartIndex].y()); m_wordMiddles.append(wordMiddle); wordStarted = false; } } } if (wordStarted) { const auto& lastNode = textNodes.last(); int xEnd = visiblePositionToContentsPoint(createVisiblePosition(Position(lastNode, lastNode->wholeText().length()))).x(); IntPoint wordMiddle((m_letterPos[wordStartIndex].x() + xEnd) / 2, m_letterPos[wordStartIndex].y()); m_wordMiddles.append(wordMiddle); } }
// Returns true if |c| may appear in an XML MIME type.
// Valid characters per RFCs 3023 and 2045:
// 0-9a-zA-Z_-+~!$^{}|.%'`#&*
static inline bool isValidXMLMIMETypeChar(UChar c)
{
    if (isASCIIAlphanumeric(c))
        return true;

    // The permitted punctuation, in ascending code point order.
    switch (c) {
    case '!':
    case '#':
    case '$':
    case '%':
    case '&':
    case '\'':
    case '*':
    case '+':
    case '-':
    case '.':
    case '^':
    case '_':
    case '`':
    case '{':
    case '|':
    case '}':
    case '~':
        return true;
    default:
        return false;
    }
}
static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc) { StringBuilder consumedCharacters; HTMLEntitySearch entitySearch; while (!source.isEmpty()) { cc = source.currentChar(); entitySearch.advance(cc); if (!entitySearch.isEntityPrefix()) break; consumedCharacters.append(cc); source.advance(); } notEnoughCharacters = source.isEmpty(); if (notEnoughCharacters) { // We can't an entity because there might be a longer entity // that we could match if we had more data. unconsumeCharacters(source, consumedCharacters); return false; } if (!entitySearch.mostRecentMatch()) { unconsumeCharacters(source, consumedCharacters); return false; } if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) { // We've consumed too many characters. We need to walk the // source back to the point at which we had consumed an // actual entity. unconsumeCharacters(source, consumedCharacters); consumedCharacters.clear(); const int length = entitySearch.mostRecentMatch()->length; const LChar* reference = entitySearch.mostRecentMatch()->entity; for (int i = 0; i < length; ++i) { cc = source.currentChar(); ASSERT_UNUSED(reference, cc == *reference++); consumedCharacters.append(cc); source.advance(); ASSERT(!source.isEmpty()); } cc = source.currentChar(); } if (entitySearch.mostRecentMatch()->lastCharacter() == ';' || !additionalAllowedCharacter || !(isASCIIAlphanumeric(cc) || cc == '=')) { decodedEntity.append(entitySearch.mostRecentMatch()->firstValue); if (entitySearch.mostRecentMatch()->secondValue) decodedEntity.append(entitySearch.mostRecentMatch()->secondValue); return true; } unconsumeCharacters(source, consumedCharacters); return false; }
// See: https://tools.ietf.org/html/bcp47#section-2.1 static bool isValidBCP47LanguageTag(const String& languageTag) { auto const length = languageTag.length(); // Max length picked as double the longest example tag in spec which is 49 characters: // https://tools.ietf.org/html/bcp47#section-4.4.2 if (length < 2 || length > 100) return false; UChar firstChar = languageTag[0]; if (!isASCIIAlpha(firstChar)) return false; UChar secondChar = languageTag[1]; if (length == 2) return isASCIIAlpha(secondChar); bool grandFatheredIrregularOrPrivateUse = (firstChar == 'i' || firstChar == 'x') && secondChar == '-'; unsigned nextCharIndexToCheck; if (!grandFatheredIrregularOrPrivateUse) { if (!isASCIIAlpha(secondChar)) return false; if (length == 3) return isASCIIAlpha(languageTag[2]); if (isASCIIAlpha(languageTag[2])) { if (languageTag[3] == '-') nextCharIndexToCheck = 4; else return false; } else if (languageTag[2] == '-') nextCharIndexToCheck = 3; else return false; } else nextCharIndexToCheck = 2; for (; nextCharIndexToCheck < length; ++nextCharIndexToCheck) { UChar c = languageTag[nextCharIndexToCheck]; if (isASCIIAlphanumeric(c) || c == '-') continue; return false; } return true; }
// Checks that |sessionId| looks correct and returns whether all checks pass. static bool isValidSessionId(const String& sessionId) { if ((sessionId.length() < MinSessionIdLength) || (sessionId.length() > MaxSessionIdLength)) return false; if (!sessionId.containsOnlyASCII()) return false; // Check that the sessionId only contains alphanumeric characters. for (unsigned i = 0; i < sessionId.length(); ++i) { if (!isASCIIAlphanumeric(sessionId[i])) return false; } return true; }
// Returns true if |chr| is allowed in a parameter name.
// TODO(yoav): We need to move this function to a central location and possibly rewrite as a lookup table. https://crbug.com/527324
static bool isValidParameterNameChar(CharType chr)
{
    // Alphanumerics are valid and are likely the common case, so bail early.
    if (isASCIIAlphanumeric(chr))
        return true;

    // Controls and space, and everything above '|', are never valid.
    if (chr <= ' ' || chr > '|')
        return false;

    // Separators plus '%', '*' and '\'' are not valid either.
    // NOTE(review): '|' is accepted here while '}' and '~' are rejected by the
    // range check above; confirm this is the intended character set.
    switch (chr) {
    case '"':
    case '%':
    case '\'':
    case '(':
    case ')':
    case '*':
    case ',':
    case '/':
    case ':':
    case ';':
    case '<':
    case '=':
    case '>':
    case '?':
    case '@':
    case '[':
    case '\\':
    case ']':
    case '{':
        return false;
    default:
        return true;
    }
}
// Decides whether a line break is allowed between |ch| and |nextCh|, given
// the character |lastCh| that precedes |ch|.
static inline bool shouldBreakAfter(UChar lastCh, UChar ch, UChar nextCh)
{
    // A '-' followed by a digit may be a minus sign, so only break there when
    // the '-' follows a letter or digit. That keeps 'ABCD-1234' and
    // '1234-5678' (which may occur in long URLs) breakable.
    if (ch == '-' && isASCIIDigit(nextCh))
        return isASCIIAlphanumeric(lastCh);

    // The lookup table covers only pairs where both characters fall inside
    // its range; anything else is left to the Unicode algorithm by returning
    // false.
    const bool chInTable = ch >= asciiLineBreakTableFirstChar && ch <= asciiLineBreakTableLastChar;
    const bool nextChInTable = nextCh >= asciiLineBreakTableFirstChar && nextCh <= asciiLineBreakTableLastChar;
    if (!chInTable || !nextChInTable)
        return false;

    // Each table row is a bitmap indexed by the following character; the table
    // is used for speed and for compatibility with other browsers (see the
    // comments for asciiLineBreakTable for details).
    const unsigned char* row = asciiLineBreakTable[ch - asciiLineBreakTableFirstChar];
    const int column = nextCh - asciiLineBreakTableFirstChar;
    const int mask = 1 << (column % 8);
    return (row[column / 8] & mask) != 0;
}
// "ident" from the CSS tokenizer, minus backslash-escape sequences static bool isCSSTokenizerIdentifier(const String& string) { const UChar* p = string.characters(); const UChar* end = p + string.length(); // -? if (p != end && p[0] == '-') ++p; // {nmstart} if (p == end || !(p[0] == '_' || p[0] >= 128 || isASCIIAlpha(p[0]))) return false; ++p; // {nmchar}* for (; p != end; ++p) { if (!(p[0] == '_' || p[0] == '-' || p[0] >= 128 || isASCIIAlphanumeric(p[0]))) return false; } return true; }
// Returns true if |c| is '-' or an ASCII letter or digit.
template<typename CharType>
inline bool isASCIIAlphanumericOrHyphen(CharType c)
{
    if (c == '-')
        return true;
    return isASCIIAlphanumeric(c);
}
// Returns true if |c| is an ASCII letter or digit, or one of '+', '-', '.'.
static bool isSchemeContinuationCharacter(UChar c)
{
    switch (c) {
    case '+':
    case '-':
    case '.':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
// Returns true if |c| is '-' or an ASCII letter or digit.
inline bool isASCIIAlphanumericOrHyphen(CharType c)
{
    if (c == '-')
        return true;
    return isASCIIAlphanumeric(c);
}
// Returns true if |c| is an ASCII letter or digit, '_' or '$'.
static inline bool isSafeIdentifierCharacter(UChar c)
{
    if (c == '_' || c == '$')
        return true;
    return isASCIIAlphanumeric(c);
}
// Returns true if |c| can continue an identifier. ASCII input must be a
// letter, digit, '$' or '_'; non-ASCII input is delegated to
// isNonASCIIIdentPart().
static inline bool isIdentPart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentPart(c);

    if (isASCIIAlphanumeric(c))
        return true;
    return c == '$' || c == '_';
}
// Parses one escape, dispatching on the character returned by peek(), and
// returns the corresponding Escape. |inCharacterClass| selects the alternate
// interpretation several escapes have inside a character class.
// NOTE(review): presumably the caller has already consumed the leading
// backslash; confirm against the call sites.
Escape Parser::consumeEscape(bool inCharacterClass)
{
    switch (peek()) {
    case EndOfPattern:
        // Nothing follows the escape introducer: record the error.
        setError(EscapeUnterminated);
        return Escape(Escape::Error);

    // Assertions
    case 'b':
        consume();
        // Inside a character class 'b' yields the '\b' character instead of
        // a word boundary assertion.
        if (inCharacterClass)
            return PatternCharacterEscape('\b');
        return WordBoundaryAssertionEscape(false); // do not invert
    case 'B':
        consume();
        // Inside a character class 'B' is just the literal character 'B'.
        if (inCharacterClass)
            return PatternCharacterEscape('B');
        return WordBoundaryAssertionEscape(true); // invert

    // CharacterClassEscape
    case 'd':
        consume();
        return CharacterClassEscape(CharacterClass::digits(), false);
    case 's':
        consume();
        return CharacterClassEscape(CharacterClass::spaces(), false);
    case 'w':
        consume();
        return CharacterClassEscape(CharacterClass::wordchar(), false);

    // For the upper-case forms: inside a character class the pre-built
    // complement class is used with the flag false; outside, the base class
    // is used with the flag true.
    case 'D':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nondigits(), false)
            : CharacterClassEscape(CharacterClass::digits(), true);
    case 'S':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonspaces(), false)
            : CharacterClassEscape(CharacterClass::spaces(), true);
    case 'W':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
            : CharacterClassEscape(CharacterClass::wordchar(), true);

    // DecimalEscape
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        // Not a usable backreference if the first digit alone already exceeds
        // the subpattern count, or if we are inside a character class.
        if (peekDigit() > m_numSubpatterns || inCharacterClass) {
            // To match Firefox, we parse an invalid backreference in the range [1-7]
            // as an octal escape.
            // For 8 and 9 nothing is consumed and a '\\' character escape is
            // returned.
            return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
        }

        // Accumulate digits for as long as the resulting number does not
        // exceed m_numSubpatterns; the first digit that would push it over is
        // left unconsumed.
        int value = 0;
        do {
            unsigned newValue = value * 10 + peekDigit();
            if (newValue > m_numSubpatterns)
                break;
            value = newValue;
            consume();
        } while (peekIsDigit());

        return BackreferenceEscape(value);
    }

    // Octal escape
    case '0':
        consume();
        return PatternCharacterEscape(consumeOctal());

    // ControlEscape
    case 'f':
        consume();
        return PatternCharacterEscape('\f');
    case 'n':
        consume();
        return PatternCharacterEscape('\n');
    case 'r':
        consume();
        return PatternCharacterEscape('\r');
    case 't':
        consume();
        return PatternCharacterEscape('\t');
    case 'v':
        consume();
        return PatternCharacterEscape('\v');

    // ControlLetter
    case 'c': {
        // The state is saved before the 'c' is consumed so that it can be
        // restored if the following character is not acceptable.
        SavedState state(*this);
        consume();

        int control = consume();
        // To match Firefox, inside a character class, we also accept numbers
        // and '_' as control characters.
        if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
            state.restore();
            return PatternCharacterEscape('\\');
        }
        // Only the low five bits of the character are kept.
        return PatternCharacterEscape(control & 31);
    }

    // HexEscape
    case 'x': {
        consume();

        // Saved after the 'x' has been consumed: if consumeHex(2) reports
        // failure with -1, restore to that point and treat 'x' as a literal.
        SavedState state(*this);
        int x = consumeHex(2);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('x');
        }
        return PatternCharacterEscape(x);
    }

    // UnicodeEscape
    case 'u': {
        consume();

        // Same scheme as the hex escape, using consumeHex(4).
        SavedState state(*this);
        int x = consumeHex(4);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('u');
        }
        return PatternCharacterEscape(x);
    }

    // IdentityEscape
    default:
        // Any other character stands for itself.
        return PatternCharacterEscape(consume());
    }
}
// Only checks for general Base64 encoded chars, not '=' chars since '=' is // positional and may only appear at the end of a Base64 encoded string. bool isBase64EncodedCharacter(UChar c) { return isASCIIAlphanumeric(c) || c == '+' || c == '/'; }
// Returns true if |c| is an ASCII letter or digit, or '-'.
bool isCSPDirectiveNameCharacter(UChar c)
{
    return c == '-' ? true : isASCIIAlphanumeric(c);
}
static inline bool isCSSTokenizerIdentifier(const CharacterType* characters, unsigned length) { const CharacterType* end = characters + length; // -? if (characters != end && characters[0] == '-') ++characters; // {nmstart} if (characters == end || !(characters[0] == '_' || characters[0] >= 128 || isASCIIAlpha(characters[0]))) return false; ++characters; // {nmchar}* for (; characters != end; ++characters) { if (!(characters[0] == '_' || characters[0] == '-' || characters[0] >= 128 || isASCIIAlphanumeric(characters[0]))) return false; } return true; }
// Returns true if |c| is an ASCII letter or digit, or one of '+', '/', '-',
// '_'.
static bool isBase64Character(UChar c)
{
    switch (c) {
    case '+':
    case '/':
    case '-':
    case '_':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
// FIXME: This should probably use common functions with ContentSecurityPolicy. static bool isIntegrityCharacter(UChar c) { // Check if it's a base64 encoded value. We're pretty loose here, as there's // not much risk in it, and it'll make it simpler for developers. return isASCIIAlphanumeric(c) || c == '_' || c == '-' || c == '+' || c == '/' || c == '='; }
// Returns true if |c| is an ASCII letter or digit, or '-'.
static bool isHostCharacter(UChar c)
{
    if (isASCIIAlphanumeric(c))
        return true;
    return c == '-';
}