// Records, for every character offset of every node in |textNodes|, the point
// returned by visiblePositionToContentsPoint() into m_letterPos, and appends to
// m_wordMiddles one point per run of ASCII alphanumeric characters ("word"):
// x is the average of the x coordinate of the word's first letter and of the
// first position after the word, y is that of the first letter.
//
// The word state is carried across node boundaries, so a word may span several
// text nodes. A word still open after the last node is closed at the end
// offset of that node.
void GranularityStrategyTest::parseText(const TextNodeVector& textNodes)
{
    bool wordStarted = false;
    int wordStartIndex = 0;
    for (auto& text : textNodes) {
        // m_letterPos is shared by all nodes; this is the index at which the
        // current node's characters start in it.
        int wordStartIndexOffset = m_letterPos.size();
        String str = text->wholeText();
        for (size_t i = 0; i < str.length(); i++) {
            m_letterPos.append(visiblePositionToContentsPoint(createVisiblePosition(Position(text, i))));
            // Keep the character at the width characterAt() returns it.
            // Narrowing it to char would fold non-ASCII code units into the
            // ASCII range (e.g. U+0141 -> 'A') and misclassify them as word
            // characters.
            const auto c = str.characterAt(i);
            const bool isWordCharacter = isASCIIAlphanumeric(c);
            if (isWordCharacter && !wordStarted) {
                wordStartIndex = i + wordStartIndexOffset;
                wordStarted = true;
            } else if (!isWordCharacter && wordStarted) {
                // The current (non-word) character marks the end of the word.
                IntPoint wordMiddle((m_letterPos[wordStartIndex].x() + m_letterPos[i + wordStartIndexOffset].x()) / 2, m_letterPos[wordStartIndex].y());
                m_wordMiddles.append(wordMiddle);
                wordStarted = false;
            }
        }
    }
    if (wordStarted) {
        // The text ended inside a word: use the end of the last node as the
        // word's right edge.
        const auto& lastNode = textNodes.last();
        int xEnd = visiblePositionToContentsPoint(createVisiblePosition(Position(lastNode, lastNode->wholeText().length()))).x();
        IntPoint wordMiddle((m_letterPos[wordStartIndex].x() + xEnd) / 2, m_letterPos[wordStartIndex].y());
        m_wordMiddles.append(wordMiddle);
    }
}
// Returns true when |c| is one of the characters accepted in an XML MIME type.
static inline bool isValidXMLMIMETypeChar(UChar c)
{
    // Valid characters per RFCs 3023 and 2045:
    // 0-9a-zA-Z_-+~!$^{}|.%'`#&*
    switch (c) {
    case '!':
    case '#':
    case '$':
    case '%':
    case '&':
    case '\'':
    case '*':
    case '+':
    case '-':
    case '.':
    case '^':
    case '_':
    case '`':
    case '{':
    case '|':
    case '}':
    case '~':
        return true;
    default:
        // Everything else is valid only if it is an ASCII letter or digit.
        return isASCIIAlphanumeric(c);
    }
}
Exemple #3
0
 // Tries to consume a named character reference from |source|.
 //
 // On success, appends the entity's decoded value(s) to |decodedEntity| and
 // returns true. On failure, every character consumed here is pushed back
 // onto |source| via unconsumeCharacters() and false is returned.
 //
 // |notEnoughCharacters| is set to true when |source| ran out while the
 // consumed characters were still a prefix of some entity. |cc| is updated
 // with the last character read from |source|.
 // NOTE(review): |additionalAllowedCharacter| is presumably non-zero only when
 // parsing inside an attribute value - confirm against the caller.
 static bool consumeNamedEntity(SegmentedString& source, StringBuilder& decodedEntity, bool& notEnoughCharacters, UChar additionalAllowedCharacter, UChar& cc)
 {
     StringBuilder consumedCharacters;
     HTMLEntitySearch entitySearch;
     // Consume characters for as long as they remain a prefix of some entity.
     while (!source.isEmpty()) {
         cc = source.currentChar();
         entitySearch.advance(cc);
         if (!entitySearch.isEntityPrefix())
             break;
         consumedCharacters.append(cc);
         source.advance();
     }
     notEnoughCharacters = source.isEmpty();
     if (notEnoughCharacters) {
         // We can't decide on an entity because there might be a longer entity
         // that we could match if we had more data.
         unconsumeCharacters(source, consumedCharacters);
         return false;
     }
     // Nothing we consumed matched a complete entity.
     if (!entitySearch.mostRecentMatch()) {
         unconsumeCharacters(source, consumedCharacters);
         return false;
     }
     if (entitySearch.mostRecentMatch()->length != entitySearch.currentLength()) {
         // We've consumed too many characters. We need to walk the
         // source back to the point at which we had consumed an
         // actual entity.
         unconsumeCharacters(source, consumedCharacters);
         consumedCharacters.clear();
         const int length = entitySearch.mostRecentMatch()->length;
         const LChar* reference = entitySearch.mostRecentMatch()->entity;
         // Re-consume exactly the characters of the matched entity.
         for (int i = 0; i < length; ++i) {
             cc = source.currentChar();
             ASSERT_UNUSED(reference, cc == *reference++);
             consumedCharacters.append(cc);
             source.advance();
             ASSERT(!source.isEmpty());
         }
         // |cc| is now the character immediately following the entity.
         cc = source.currentChar();
     }
     // Accept the match when it ends with ';', when no additional allowed
     // character was supplied, or when the character after the entity is
     // neither an ASCII alphanumeric nor '='.
     if (entitySearch.mostRecentMatch()->lastCharacter() == ';'
         || !additionalAllowedCharacter
         || !(isASCIIAlphanumeric(cc) || cc == '=')) {
         decodedEntity.append(entitySearch.mostRecentMatch()->firstValue);
         // Some entities carry a second value; append it when present.
         if (entitySearch.mostRecentMatch()->secondValue)
             decodedEntity.append(entitySearch.mostRecentMatch()->secondValue);
         return true;
     }
     unconsumeCharacters(source, consumedCharacters);
     return false;
 }
Exemple #4
0
// Loosely checks that |languageTag| has the shape of a BCP 47 language tag.
// See: https://tools.ietf.org/html/bcp47#section-2.1
//
// Accepted shapes:
//   - a primary language subtag of 2 or 3 ASCII letters on its own;
//   - such a subtag followed by '-' and any run of ASCII alphanumerics and '-';
//   - a grandfathered irregular ("i-...") or private-use ("x-...") tag, whose
//     remainder is any run of ASCII alphanumerics and '-'.
// Language tags are case-insensitive, so the "i"/"x" singleton is matched in
// either case.
static bool isValidBCP47LanguageTag(const String& languageTag)
{
    auto const length = languageTag.length();

    // Max length picked as double the longest example tag in spec which is 49 characters:
    // https://tools.ietf.org/html/bcp47#section-4.4.2
    if (length < 2 || length > 100)
        return false;

    UChar firstChar = languageTag[0];

    if (!isASCIIAlpha(firstChar))
        return false;

    UChar secondChar = languageTag[1];

    if (length == 2)
        return isASCIIAlpha(secondChar);

    bool startsWithSingleton = firstChar == 'i' || firstChar == 'I' || firstChar == 'x' || firstChar == 'X';
    bool grandFatheredIrregularOrPrivateUse = startsWithSingleton && secondChar == '-';
    unsigned nextCharIndexToCheck;

    if (!grandFatheredIrregularOrPrivateUse) {
        if (!isASCIIAlpha(secondChar))
            return false;

        if (length == 3)
            return isASCIIAlpha(languageTag[2]);

        // length >= 4 from here on, so indexing [3] is in range.
        if (isASCIIAlpha(languageTag[2])) {
            // Three-letter primary subtag: it must be followed by '-'.
            if (languageTag[3] == '-')
                nextCharIndexToCheck = 4;
            else
                return false;
        } else if (languageTag[2] == '-')
            nextCharIndexToCheck = 3;
        else
            return false;
    } else
        nextCharIndexToCheck = 2;

    // The remaining subtags are not validated individually; only the
    // character set is checked.
    for (; nextCharIndexToCheck < length; ++nextCharIndexToCheck) {
        UChar c = languageTag[nextCharIndexToCheck];
        if (isASCIIAlphanumeric(c) || c == '-')
            continue;
        return false;
    }
    return true;
}
// Returns true when |sessionId| passes every sanity check: its length lies in
// [MinSessionIdLength, MaxSessionIdLength], it is pure ASCII, and each of its
// characters is an ASCII alphanumeric.
static bool isValidSessionId(const String& sessionId)
{
    const auto idLength = sessionId.length();
    if (idLength < MinSessionIdLength || idLength > MaxSessionIdLength)
        return false;

    if (!sessionId.containsOnlyASCII())
        return false;

    // Scan forward until the first character that is not alphanumeric.
    unsigned index = 0;
    while (index < idLength && isASCIIAlphanumeric(sessionId[index]))
        ++index;

    // Valid only if the scan reached the end of the string.
    return index == idLength;
}
// Returns true when |chr| may appear in a header parameter name: any ASCII
// token character other than '%', '*' and '\''. That is, ASCII alphanumerics
// plus ! # $ & + - . ^ _ ` | ~
static bool isValidParameterNameChar(CharType chr)
{
    // TODO(yoav): We need to move this function to a central location and possibly rewrite as a lookup table. https://crbug.com/527324

    // Alpha-numeric is a valid char.
    // This is likely the common case - bailing early.
    if (isASCIIAlphanumeric(chr))
        return true;
    // A separator or CTL or '%', '*' or '\'' means the char is not valid.
    // So any of: {}[]/\:;<=>?@,()*'"%
    // Space, control characters, DEL (0x7F) and anything outside ASCII are
    // rejected by the range checks. '~' is neither a separator nor a CTL, so
    // it must stay valid: the upper bound is DEL rather than '|'.
    if (chr <= ' ' || chr >= 0x7F || chr == '{' || chr == '}' || chr == ']' || chr == '['
        || chr == '/' || chr == '\\' || (chr <= '@' && chr >= ':') || chr == ','
        || (chr >= '(' && chr <= '*') || chr == '\'' || chr == '"' || chr == '%') {
        return false;
    }
    return true;
}
Exemple #7
0
// Decides whether a line break is allowed after |ch|, given the character
// before it (|lastCh|) and the character after it (|nextCh|). Returns false
// when the decision should be left to the Unicode algorithm.
static inline bool shouldBreakAfter(UChar lastCh, UChar ch, UChar nextCh)
{
    // A '-' directly before a digit may be a minus sign, so breaking there is
    // refused unless the preceding character is alphanumeric: 'ABCD-1234' and
    // '1234-5678' (e.g. inside long URLs) stay breakable.
    if (ch == '-' && isASCIIDigit(nextCh))
        return isASCIIAlphanumeric(lastCh);

    // The lookup table only covers pairs where both characters are in its
    // ASCII range; it is used for speed and for compatibility with other
    // browsers (see comments for asciiLineBreakTable for details).
    const bool currentIsInTable = ch >= asciiLineBreakTableFirstChar && ch <= asciiLineBreakTableLastChar;
    const bool nextIsInTable = nextCh >= asciiLineBreakTableFirstChar && nextCh <= asciiLineBreakTableLastChar;
    if (!currentIsInTable || !nextIsInTable) {
        // Defer to the Unicode algorithm.
        return false;
    }

    // Each row is a bitmap with one bit per possible next character.
    const int bitIndex = nextCh - asciiLineBreakTableFirstChar;
    const unsigned char* bitmapRow = asciiLineBreakTable[ch - asciiLineBreakTableFirstChar];
    return bitmapRow[bitIndex / 8] & (1 << (bitIndex % 8));
}
// "ident" from the CSS tokenizer, minus backslash-escape sequences
static bool isCSSTokenizerIdentifier(const String& string)
{
    const UChar* p = string.characters();
    const UChar* end = p + string.length();

    // -?
    if (p != end && p[0] == '-')
        ++p;

    // {nmstart}
    if (p == end || !(p[0] == '_' || p[0] >= 128 || isASCIIAlpha(p[0])))
        return false;
    ++p;

    // {nmchar}*
    for (; p != end; ++p) {
        if (!(p[0] == '_' || p[0] == '-' || p[0] >= 128 || isASCIIAlphanumeric(p[0])))
            return false;
    }

    return true;
}
// Returns true when |c| is '-' or an ASCII alphanumeric character.
template<typename CharType> inline bool isASCIIAlphanumericOrHyphen(CharType c)
{
    if (c == '-')
        return true;
    return isASCIIAlphanumeric(c);
}
// Returns true when |c| is an ASCII alphanumeric, '+', '-' or '.'.
static bool isSchemeContinuationCharacter(UChar c)
{
    switch (c) {
    case '+':
    case '-':
    case '.':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
Exemple #11
0
// Returns true when |c| is '-' or an ASCII alphanumeric character.
inline bool isASCIIAlphanumericOrHyphen(CharType c) {
  if (c == '-')
    return true;
  return isASCIIAlphanumeric(c);
}
Exemple #12
0
// Returns true when |c| is an ASCII alphanumeric, '_' or '$'.
static inline bool isSafeIdentifierCharacter(UChar c)
{
    switch (c) {
    case '_':
    case '$':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
Exemple #13
0
// Returns true when |c| can continue an identifier. ASCII input qualifies if
// it is alphanumeric, '$' or '_'; non-ASCII input is delegated to
// isNonASCIIIdentPart().
static inline bool isIdentPart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentPart(c);

    if (isASCIIAlphanumeric(c))
        return true;
    return c == '$' || c == '_';
}
// Parses one escape sequence, starting at the character returned by peek(),
// and returns the Escape describing it.
// NOTE(review): the leading backslash has presumably been consumed by the
// caller already - confirm.
//
// |inCharacterClass| selects the interpretation of escapes that differ inside
// a character class (\b, \B, \D, \S, \W, decimal escapes and \c).
// At the end of the pattern, the error is set to EscapeUnterminated and an
// Escape::Error escape is returned.
Escape Parser::consumeEscape(bool inCharacterClass)
{
    switch (peek()) {
    case EndOfPattern:
        setError(EscapeUnterminated);
        return Escape(Escape::Error);

    // Assertions
    case 'b':
        consume();
        // Inside a character class \b is the backspace character, not an assertion.
        if (inCharacterClass)
            return PatternCharacterEscape('\b');
        return WordBoundaryAssertionEscape(false); // do not invert
    case 'B':
        consume();
        // Inside a character class \B is just the literal 'B'.
        if (inCharacterClass)
            return PatternCharacterEscape('B');
        return WordBoundaryAssertionEscape(true); // invert

    // CharacterClassEscape
    case 'd':
        consume();
        return CharacterClassEscape(CharacterClass::digits(), false);
    case 's':
        consume();
        return CharacterClassEscape(CharacterClass::spaces(), false);
    case 'w':
        consume();
        return CharacterClassEscape(CharacterClass::wordchar(), false);
    // For the negated classes: inside a character class the complemented
    // class is used un-inverted; outside, the base class is used with the
    // invert flag set.
    case 'D':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nondigits(), false)
            : CharacterClassEscape(CharacterClass::digits(), true);
    case 'S':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonspaces(), false)
            : CharacterClassEscape(CharacterClass::spaces(), true);
    case 'W':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
            : CharacterClassEscape(CharacterClass::wordchar(), true);

    // DecimalEscape
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        // A first digit larger than the number of subpatterns, or any digit
        // inside a character class, cannot be a backreference.
        if (peekDigit() > m_numSubpatterns || inCharacterClass) {
            // To match Firefox, we parse an invalid backreference in the range [1-7]
            // as an octal escape.
            // For 8 and 9 nothing is consumed and a literal backslash is returned.
            return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
        }

        // Accumulate digits for as long as the value stays a valid
        // subpattern index; a digit that would overshoot is left unconsumed.
        int value = 0;
        do {
            unsigned newValue = value * 10 + peekDigit();
            if (newValue > m_numSubpatterns)
                break;
            value = newValue;
            consume();
        } while (peekIsDigit());

        return BackreferenceEscape(value);
    }

    // Octal escape
    case '0':
        consume();
        return PatternCharacterEscape(consumeOctal());

    // ControlEscape
    case 'f':
        consume();
        return PatternCharacterEscape('\f');
    case 'n':
        consume();
        return PatternCharacterEscape('\n');
    case 'r':
        consume();
        return PatternCharacterEscape('\r');
    case 't':
        consume();
        return PatternCharacterEscape('\t');
    case 'v':
        consume();
        return PatternCharacterEscape('\v');

    // ControlLetter
    case 'c': {
        // Saved before consuming 'c' so that a rejected control letter rolls
        // back both the 'c' and the character after it.
        SavedState state(*this);
        consume();
        
        int control = consume();
        // To match Firefox, inside a character class, we also accept numbers
        // and '_' as control characters.
        if ((!inCharacterClass && !isASCIIAlpha(control)) || (!isASCIIAlphanumeric(control) && control != '_')) {
            state.restore();
            return PatternCharacterEscape('\\');
        }
        // Only the low five bits of the control character are kept.
        return PatternCharacterEscape(control & 31);
    }

    // HexEscape
    case 'x': {
        consume();

        // Saved after consuming 'x': if two hex digits do not follow, the
        // escape is treated as a literal 'x' and the digits are re-parsed.
        SavedState state(*this);
        int x = consumeHex(2);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('x');
        }
        return PatternCharacterEscape(x);
    }

    // UnicodeEscape
    case 'u': {
        consume();

        // Same recovery as for \x, but four hex digits are required.
        SavedState state(*this);
        int x = consumeHex(4);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('u');
        }
        return PatternCharacterEscape(x);
    }

    // IdentityEscape
    default:
        return PatternCharacterEscape(consume());
    }
}
// Returns true for the characters of the general Base64 alphabet: ASCII
// alphanumerics, '+' and '/'. The padding character '=' is deliberately not
// accepted here because it is positional and may only appear at the end of a
// Base64 encoded string.
bool isBase64EncodedCharacter(UChar c)
{
    if (c == '+' || c == '/')
        return true;
    return isASCIIAlphanumeric(c);
}
// Returns true when |c| is an ASCII alphanumeric or '-'.
bool isCSPDirectiveNameCharacter(UChar c)
{
    if (c == '-')
        return true;
    return isASCIIAlphanumeric(c);
}
Exemple #17
0
static inline bool isCSSTokenizerIdentifier(const CharacterType* characters, unsigned length)
{
    const CharacterType* end = characters + length;

    // -?
    if (characters != end && characters[0] == '-')
        ++characters;

    // {nmstart}
    if (characters == end || !(characters[0] == '_' || characters[0] >= 128 || isASCIIAlpha(characters[0])))
        return false;
    ++characters;

    // {nmchar}*
    for (; characters != end; ++characters) {
        if (!(characters[0] == '_' || characters[0] == '-' || characters[0] >= 128 || isASCIIAlphanumeric(characters[0])))
            return false;
    }

    return true;
}
// Returns true when |c| is an ASCII alphanumeric or one of '+', '/', '-', '_'.
static bool isBase64Character(UChar c)
{
    switch (c) {
    case '+':
    case '/':
    case '-':
    case '_':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
// FIXME: This should probably use common functions with ContentSecurityPolicy.
// Returns true when |c| is an ASCII alphanumeric or one of '_', '-', '+',
// '/', '='.
static bool isIntegrityCharacter(UChar c)
{
    // Check if it's a base64 encoded value. We're pretty loose here, as there's
    // not much risk in it, and it'll make it simpler for developers.
    switch (c) {
    case '_':
    case '-':
    case '+':
    case '/':
    case '=':
        return true;
    default:
        return isASCIIAlphanumeric(c);
    }
}
// Returns true when |c| is an ASCII alphanumeric or '-'.
static bool isHostCharacter(UChar c)
{
    if (c == '-')
        return true;
    return isASCIIAlphanumeric(c);
}