Example #1
0
// See: https://tools.ietf.org/html/bcp47#section-2.1
static bool isValidBCP47LanguageTag(const String& languageTag)
{
    auto const length = languageTag.length();

    // Max length picked as double the longest example tag in spec which is 49 characters:
    // https://tools.ietf.org/html/bcp47#section-4.4.2
    if (length < 2 || length > 100)
        return false;

    UChar firstChar = languageTag[0];

    if (!isASCIIAlpha(firstChar))
        return false;

    UChar secondChar = languageTag[1];

    if (length == 2)
        return isASCIIAlpha(secondChar);

    bool grandFatheredIrregularOrPrivateUse = (firstChar == 'i' || firstChar == 'x') && secondChar == '-';
    unsigned nextCharIndexToCheck;

    if (!grandFatheredIrregularOrPrivateUse) {
        if (!isASCIIAlpha(secondChar))
            return false;

        if (length == 3)
            return isASCIIAlpha(languageTag[2]);

        if (isASCIIAlpha(languageTag[2])) {
            if (languageTag[3] == '-')
                nextCharIndexToCheck = 4;
            else
                return false;
        } else if (languageTag[2] == '-')
            nextCharIndexToCheck = 3;
        else
            return false;
    } else
        nextCharIndexToCheck = 2;

    for (; nextCharIndexToCheck < length; ++nextCharIndexToCheck) {
        UChar c = languageTag[nextCharIndexToCheck];
        if (isASCIIAlphanumeric(c) || c == '-')
            continue;
        return false;
    }
    return true;
}
Example #2
0
// Converts a Windows date/time picture string [1][2] into an LDML date format
// pattern [3].
//
// The symbols h, H, m, s, d, dd, M and y are copied unchanged because they
// mean the same thing in both notations. The remaining symbols are mapped:
//     t    -> a
//     tt   -> a
//     ddd  -> EEE
//     dddd -> EEEE (also used for longer runs of 'd')
//     g    -> G
//     gg   -> dropped
// Any other run of ASCII letters, and every non-letter, is collected as
// literal text and handed to commitLiteralToken().
//
// [1] http://msdn.microsoft.com/en-us/library/dd317787(v=vs.85).aspx
// [2] http://msdn.microsoft.com/en-us/library/dd318148(v=vs.85).aspx
// [3] LDML http://unicode.org/reports/tr35/tr35-6.html#Date_Format_Patterns
static String convertWindowsDateTimeFormat(const String& format)
{
    StringBuilder converted;
    StringBuilder literalBuffer;
    bool inQuote = false;
    bool lastQuoteCanBeLiteral = false;
    for (unsigned i = 0; i < format.length(); ++i) {
        const UChar ch = format[i];

        if (ch == '\'') {
            // A closing quote can never be the first character of the format.
            ASSERT(!inQuote || i);
            // Two adjacent quotes stand for one literal quote, but a quote that
            // was already paired up cannot be reused for the next pair.
            const bool completesQuotePair = lastQuoteCanBeLiteral && i > 0 && format[i - 1] == '\'';
            if (completesQuotePair)
                literalBuffer.append('\'');
            lastQuoteCanBeLiteral = !completesQuotePair;
            inQuote = !inQuote;
            continue;
        }

        // Quoted text and non-letters are literal.
        if (inQuote || !isASCIIAlpha(ch)) {
            literalBuffer.append(ch);
            continue;
        }

        // |ch| starts a run of letters: flush pending literal text, then
        // translate the whole run at once.
        commitLiteralToken(literalBuffer, converted);
        const unsigned symbolStart = i;
        const unsigned count = countContinuousLetters(format, i);
        i += count - 1;

        switch (ch) {
        case 'h':
        case 'H':
        case 'm':
        case 's':
        case 'M':
        case 'y':
            converted.append(format, symbolStart, count);
            break;
        case 'd':
            if (count >= 4)
                converted.append("EEEE");
            else if (count == 3)
                converted.append("EEE");
            else
                converted.append(format, symbolStart, count);
            break;
        case 'g':
            // gg means imperial era in Windows; it is simply ignored.
            if (count == 1)
                converted.append('G');
            break;
        case 't':
            converted.append('a');
            break;
        default:
            literalBuffer.append(format, symbolStart, count);
            break;
        }
    }
    commitLiteralToken(literalBuffer, converted);
    return converted.toString();
}
// Returns true when |c| may begin a CSS name: an ASCII letter, an underscore,
// or any non-ASCII code unit.
// http://dev.w3.org/csswg/css-syntax/#name-start-code-point
static bool isNameStart(UChar c)
{
    return c == '_' || isASCIIAlpha(c) || !isASCII(c);
}
// Returns true when |c| is a valid digit for numbers written in |base|.
// Decimal digits count from '0'; letters (either case) stand for the values
// 10 and up, with bases above 36 treated as 36 for the letter range.
static bool isCharacterAllowedInBase(UChar c, int base)
{
    // Only ASCII characters can be digits.
    if (c > 0x7F)
        return false;

    if (isASCIIDigit(c))
        return c - '0' < base;

    if (!isASCIIAlpha(c))
        return false;

    // 'a'/'A' represents 10, so a base of N admits the first N - 10 letters.
    const int letterBase = base > 36 ? 36 : base;
    const bool isAllowedLowercase = c >= 'a' && c < 'a' + letterBase - 10;
    const bool isAllowedUppercase = c >= 'A' && c < 'A' + letterBase - 10;
    return isAllowedLowercase || isAllowedUppercase;
}
// Passes every code unit in [0, 1000], plus USHRT_MAX, to testToken() along
// with the token type expected for that single character.
TEST(MediaQueryTokenizerCodepointsTest, Basic)
{
    for (UChar codepoint = 0; codepoint <= 1000; ++codepoint) {
        switch (codepoint) {
        case '\0':
            testToken(codepoint, EOFToken);
            break;
        case '\r':
        case ' ':
        case '\n':
        case '\t':
        case '\f':
            testToken(codepoint, WhitespaceToken);
            break;
        case '_':
            testToken(codepoint, IdentToken);
            break;
        case '(':
            testToken(codepoint, LeftParenthesisToken);
            break;
        case ')':
            testToken(codepoint, RightParenthesisToken);
            break;
        case '[':
            testToken(codepoint, LeftBracketToken);
            break;
        case ']':
            testToken(codepoint, RightBracketToken);
            break;
        case '{':
            testToken(codepoint, LeftBraceToken);
            break;
        case '}':
            testToken(codepoint, RightBraceToken);
            break;
        case '.':
        case '+':
        case '-':
        case '/':
        case '\\':
            testToken(codepoint, DelimiterToken);
            break;
        case '\'':
        case '"':
            testToken(codepoint, StringToken);
            break;
        case ',':
            testToken(codepoint, CommaToken);
            break;
        case ':':
            testToken(codepoint, ColonToken);
            break;
        case ';':
            testToken(codepoint, SemicolonToken);
            break;
        default:
            // Digits, letters and everything above the signed-char range have
            // dedicated expectations; any other character is a delimiter.
            if (isASCIIDigit(codepoint))
                testToken(codepoint, NumberToken);
            else if (isASCIIAlpha(codepoint) || codepoint > SCHAR_MAX)
                testToken(codepoint, IdentToken);
            else
                testToken(codepoint, DelimiterToken);
            break;
        }
    }
    testToken(USHRT_MAX, IdentToken);
}
Example #6
0
static inline bool isCSSTokenizerIdentifier(const CharacterType* characters, unsigned length)
{
    const CharacterType* end = characters + length;

    // -?
    if (characters != end && characters[0] == '-')
        ++characters;

    // {nmstart}
    if (characters == end || !(characters[0] == '_' || characters[0] >= 128 || isASCIIAlpha(characters[0])))
        return false;
    ++characters;

    // {nmchar}*
    for (; characters != end; ++characters) {
        if (!(characters[0] == '_' || characters[0] == '-' || characters[0] >= 128 || isASCIIAlphanumeric(characters[0])))
            return false;
    }

    return true;
}
Example #7
0
// "ident" from the CSS tokenizer, minus backslash-escape sequences
static bool isCSSTokenizerIdentifier(const String& string)
{
    const UChar* p = string.characters();
    const UChar* end = p + string.length();

    // -?
    if (p != end && p[0] == '-')
        ++p;

    // {nmstart}
    if (p == end || !(p[0] == '_' || p[0] >= 128 || isASCIIAlpha(p[0])))
        return false;
    ++p;

    // {nmchar}*
    for (; p != end; ++p) {
        if (!(p[0] == '_' || p[0] == '-' || p[0] >= 128 || isASCIIAlphanumeric(p[0])))
            return false;
    }

    return true;
}
Example #8
0
// Builds a display string from a date format pattern: each run of a year,
// month or day-in-month symbol is replaced by its localized field name (with
// "Year" / "Month" / "Day" as fallbacks), quoted text is copied without its
// quotes, and all other characters are copied once per run of identical ASCII letters.
// Specification of the input:
// http://icu-project.org/apiref/icu4c/classSimpleDateFormat.html#details
static String localizeFormat(const Vector<UChar>& buffer)
{
    StringBuilder builder;
    UChar lastChar = 0;
    bool inQuote = false;
    for (unsigned i = 0; i < buffer.size(); ++i) {
        const UChar current = buffer[i];

        if (inQuote) {
            if (current != '\'') {
                builder.append(current);
                continue;
            }
            // Closing quote. If it directly follows another quote, emit one literal quote.
            inQuote = false;
            lastChar = 0;
            ASSERT(i);
            if (buffer[i - 1] == '\'')
                builder.append('\'');
            continue;
        }

        // Collapse a run of the same pattern letter into a single occurrence.
        if (lastChar == current && isASCIIAlpha(lastChar))
            continue;
        lastChar = current;

        if (isICUYearSymbol(current)) {
            String yearText = dateFormatYearText();
            builder.append(yearText.isEmpty() ? "Year" : yearText);
            continue;
        }
        if (isICUMonthSymbol(current)) {
            String monthText = dateFormatMonthText();
            builder.append(monthText.isEmpty() ? "Month" : monthText);
            continue;
        }
        if (isICUDayInMonthSymbol(current)) {
            String dayText = dateFormatDayInMonthText();
            builder.append(dayText.isEmpty() ? "Day" : dayText);
            continue;
        }

        if (current == '\'')
            inQuote = true;
        else
            builder.append(current);
    }
    return builder.toString();
}
void FormatPrinter::print(const char* format, va_list args)
{
    const char* p = format;
    const char* errorStr;

    // buffer is only used for 2 purposes:
    // 1. To temporarily hold a copy of normal chars (not needing formatting)
    //    to be passed to printArg() and printed.
    //
    //    The incoming format string may contain a string of normal chars much
    //    longer than 128, but we handle this by breaking them out to 128 chars
    //    fragments and printing each fragment before re-using the buffer to
    //    load up the next fragment.
    //
    // 2. To hold a single "%..." format to be passed to printArg() to process
    //    a single va_arg.

    char buffer[129]; // 128 chars + null terminator.
    char* end = &buffer[sizeof(buffer) - 1];
    const char* startOfFormatSpecifier = 0;

    while (true) {
        char c = *p++;
        char* curr = buffer;

        // Print leading normal chars:
        while (c != '\0' && c != '%') {
            *curr++ = c;
            if (curr == end) {
                // Out of buffer space. Flush the fragment, and start over.
                *curr = '\0';
                bool success = printArg("%s", buffer);
                if (!success) {
                    errorStr = buffer;
                    goto handleError;
                }
                curr = buffer;
            }
            c = *p++;
        }
        // If we have stuff in the buffer, flush the fragment:
        if (curr != buffer) {
            ASSERT(curr < end + 1);
            *curr = '\0';
            bool success = printArg("%s", buffer);
            if (!success) {
                errorStr = buffer;
                goto handleError;
            }
        }

        // End if there are no more chars to print:
        if (c == '\0')
            break;

        // If we get here, we must have seen a '%':
        startOfFormatSpecifier = p - 1;
        ASSERT(*startOfFormatSpecifier == '%');
        c = *p++;

        // Check for "%%" case:
        if (c == '%') {
            bool success = printArg("%c", '%');
            if (!success) {
                errorStr = p - 2;
                goto handleError;
            }
            continue;
        }

        // Check for JS (%J<x>) formatting extensions:
        if (c == 'J') {
            bool verbose = false;

            c = *p++;
            if (UNLIKELY(c == '\0')) {
                errorStr = p - 2; // Rewind to % in "%J\0"
                goto handleError;
            }

            if (c == '+') {
                verbose = true;
                c= *p++;
                if (UNLIKELY(c == '\0')) {
                    errorStr = p - 3; // Rewind to % in "%J+\0"
                    goto handleError;
                }
            }

            switch (c) {
            // %Js - WTF::String*
            case 's': {
                printWTFString(args, verbose);
                continue;
            }
            } // END switch.

        // Check for non-JS extensions:
        } else if (c == 'b') {
            int value = va_arg(args, int);
            printArg("%s", value ? "TRUE" : "FALSE");
            continue;
        }

        // If we didn't handle the format in one of the above cases,
        // rewind p and let the standard formatting check handle it
        // if possible:
        p = startOfFormatSpecifier;
        ASSERT(*p == '%');

        // Check for standard formatting:
        // A format specifier always starts with a % and ends with some
        // alphabet. We'll do the simple thing and scan until the next
        // alphabet, or the end of string.

        // In the following, we're going to use buffer as storage for a copy
        // of a single format specifier. Hence, conceptually, we can think of
        // 'buffer' as synonymous with 'argFormat' here:

#define ABORT_IF_FORMAT_TOO_LONG(curr) \
        do {                           \
            if (UNLIKELY(curr >= end)) \
                goto formatTooLong;    \
        } while (false)
        
        curr = buffer;
        *curr++ = *p++; // Output the first % in the format specifier.
        c = *p++; // Grab the next char in the format specifier.

        // Checks for leading modifiers e.g. "%-d":
        //     0, -, ' ', +, '\''
        if (c == '0' || c == '-' || c == ' ' || c == '+' || c == '\'' || c == '#') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }

        // Checks for decimal digit field width modifiers e.g. "%2f":
        while (c >= '0' && c <= '9') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }

        // Checks for '.' e.g. "%2.f":
        if (c == '.') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;

            // Checks for decimal digit precision modifiers  e.g. "%.2f":
            while (c >= '0' && c <= '9') {
                ABORT_IF_FORMAT_TOO_LONG(curr);
                *curr++ = c;
                c = *p++;
            }
        }

        // Checks for the modifier <m> where <m> can be:
        //     l, h, j, t, z
        // e.g. "%ld"
        if (c == 'l' || c == 'h' || c == 'j' || c == 't' || c == 'z' || c == 'L') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            char prevChar = c;
            c = *p++;

            // Checks for the modifier ll or hh in %<x><m>:
            if ((prevChar == 'l' || prevChar == 'h') && c == prevChar) {
                ABORT_IF_FORMAT_TOO_LONG(curr);
                *curr++ = c;
                c = *p++;
            }
        }

        // Checks for %<x> where <x> can be:
        //     d, i, n, o, u, x, X
        // But hey, we're just going to do the simple thing and allow any
        // alphabet. The user is expected to pass correct format specifiers.
        // We won't do any format checking here. We'll just pass it on, and the
        // underlying ...printf() implementation may do the needed checking
        // at its discretion.
        while (c != '\0' && !isASCIIAlpha(c)) {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }

        ABORT_IF_FORMAT_TOO_LONG(curr);
        *curr++ = c;
        if (c == '\0') {
            // Uh oh. Bad format. We should have gotten an alphabet instead.
            // Print the supposed format as a string instead:
            errorStr = buffer;
            goto handleError;
        }

        // Otherwise, we have the alpha that terminates the format.
        // Terminate the buffer (i.e. argFormat) string:
        ASSERT(isASCIIAlpha(c));
        ABORT_IF_FORMAT_TOO_LONG(curr);
        *curr = '\0';

        bool success = printArg(buffer, args);
        if (!success) {
            errorStr = buffer;
            goto handleError;
        }
    }
Example #10
0
// Returns true for the characters accepted at the start of a "safe"
// identifier: '_', '$' and ASCII letters.
static inline bool isSafeIdentifierStartCharacter(UChar c)
{
    if (c == '_' || c == '$')
        return true;
    return isASCIIAlpha(c);
}
Example #11
0
// Returns true when |c| may start an identifier. ASCII input is accepted if it
// is a letter, '$' or '_'; anything else is delegated to isNonASCIIIdentStart().
static inline bool isIdentStart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentStart(c);
    return isASCIIAlpha(c) || c == '$' || c == '_';
}
Example #12
0
void IntlDateTimeFormat::setFormatsFromPattern(const StringView& pattern)
{
    // Get all symbols from the pattern, and set format fields accordingly.
    // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table
    unsigned length = pattern.length();
    for (unsigned i = 0; i < length; ++i) {
        UChar currentCharacter = pattern[i];
        if (!isASCIIAlpha(currentCharacter))
            continue;

        unsigned count = 1;
        while (i + 1 < length && pattern[i + 1] == currentCharacter) {
            ++count;
            ++i;
        }

        if (currentCharacter == 'h' || currentCharacter == 'K')
            m_hour12 = true;
        else if (currentCharacter == 'H' || currentCharacter == 'k')
            m_hour12 = false;

        switch (currentCharacter) {
        case 'G':
            if (count <= 3)
                m_era = Era::Short;
            else if (count == 4)
                m_era = Era::Long;
            else if (count == 5)
                m_era = Era::Narrow;
            break;
        case 'y':
            if (count == 1)
                m_year = Year::Numeric;
            else if (count == 2)
                m_year = Year::TwoDigit;
            break;
        case 'M':
        case 'L':
            if (count == 1)
                m_month = Month::Numeric;
            else if (count == 2)
                m_month = Month::TwoDigit;
            else if (count == 3)
                m_month = Month::Short;
            else if (count == 4)
                m_month = Month::Long;
            else if (count == 5)
                m_month = Month::Narrow;
            break;
        case 'E':
        case 'e':
        case 'c':
            if (count <= 3)
                m_weekday = Weekday::Short;
            else if (count == 4)
                m_weekday = Weekday::Long;
            else if (count == 5)
                m_weekday = Weekday::Narrow;
            break;
        case 'd':
            if (count == 1)
                m_day = Day::Numeric;
            else if (count == 2)
                m_day = Day::TwoDigit;
            break;
        case 'h':
        case 'H':
        case 'k':
        case 'K':
            if (count == 1)
                m_hour = Hour::Numeric;
            else if (count == 2)
                m_hour = Hour::TwoDigit;
            break;
        case 'm':
            if (count == 1)
                m_minute = Minute::Numeric;
            else if (count == 2)
                m_minute = Minute::TwoDigit;
            break;
        case 's':
            if (count == 1)
                m_second = Second::Numeric;
            else if (count == 2)
                m_second = Second::TwoDigit;
            break;
        case 'z':
        case 'v':
        case 'V':
            if (count == 1)
                m_timeZoneName = TimeZoneName::Short;
            else if (count == 4)
                m_timeZoneName = TimeZoneName::Long;
            break;
        }
    }
}
Example #13
0
// Returns true when |ch| is a single quote or an ASCII letter.
static bool isASCIIAlphabetOrQuote(UChar ch)
{
    if (ch == '\'')
        return true;
    return isASCIIAlpha(ch);
}
Example #14
0
// Parses one escape sequence, dispatching on the character returned by peek(),
// and returns the Escape describing it.
// NOTE(review): presumably the caller has already consumed the leading
// backslash, since the switch inspects the character that follows it — confirm.
//
// inCharacterClass changes how several escapes are interpreted: 'b' and 'B'
// become plain characters, 'D'/'S'/'W' use the pre-complemented classes, and
// decimal escapes are never treated as backreferences.
//
// On an unterminated escape, sets EscapeUnterminated and returns an
// Escape::Error value.
Escape Parser::consumeEscape(bool inCharacterClass)
{
    switch (peek()) {
    case EndOfPattern:
        setError(EscapeUnterminated);
        return Escape(Escape::Error);

    // Assertions
    case 'b':
        consume();
        // Inside a character class this yields the '\b' (backspace) character rather than an assertion.
        if (inCharacterClass)
            return PatternCharacterEscape('\b');
        return WordBoundaryAssertionEscape(false); // do not invert
    case 'B':
        consume();
        // Inside a character class this is just the letter 'B'.
        if (inCharacterClass)
            return PatternCharacterEscape('B');
        return WordBoundaryAssertionEscape(true); // invert

    // CharacterClassEscape
    case 'd':
        consume();
        return CharacterClassEscape(CharacterClass::digits(), false);
    case 's':
        consume();
        return CharacterClassEscape(CharacterClass::spaces(), false);
    case 'w':
        consume();
        return CharacterClassEscape(CharacterClass::wordchar(), false);
    // For the upper-case forms, a character class gets the "non-" class with
    // the second argument false; elsewhere the base class is passed with the
    // second argument true (presumably an "invert" flag — confirm against
    // CharacterClassEscape).
    case 'D':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nondigits(), false)
            : CharacterClassEscape(CharacterClass::digits(), true);
    case 'S':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonspaces(), false)
            : CharacterClassEscape(CharacterClass::spaces(), true);
    case 'W':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
            : CharacterClassEscape(CharacterClass::wordchar(), true);

    // DecimalEscape
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        // Not a usable backreference: the first digit alone already exceeds
        // the subpattern count, or we are inside a character class.
        if (peekDigit() > m_numSubpatterns || inCharacterClass) {
            // To match Firefox, we parse an invalid backreference in the range [1-7]
            // as an octal escape.
            // For 8 and 9 nothing is consumed here; a literal backslash is returned instead.
            return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
        }

        // Accumulate decimal digits for as long as the number still does not
        // exceed m_numSubpatterns; a digit that would push it past that is
        // left unconsumed.
        int value = 0;
        do {
            unsigned newValue = value * 10 + peekDigit();
            if (newValue > m_numSubpatterns)
                break;
            value = newValue;
            consume();
        } while (peekIsDigit());

        return BackreferenceEscape(value);
    }

    // Octal escape
    case '0':
        consume();
        return PatternCharacterEscape(consumeOctal());

    // ControlEscape
    case 'f':
        consume();
        return PatternCharacterEscape('\f');
    case 'n':
        consume();
        return PatternCharacterEscape('\n');
    case 'r':
        consume();
        return PatternCharacterEscape('\r');
    case 't':
        consume();
        return PatternCharacterEscape('\t');
    case 'v':
        consume();
        return PatternCharacterEscape('\v');

    // ControlLetter
    case 'c': {
        // The state is saved before 'c' is consumed, so restore() below
        // presumably rewinds to just before the 'c' — confirm against SavedState.
        SavedState state(*this);
        consume();
        
        int control = consume();
        // Only ASCII letters are valid control letters; otherwise give back
        // what was consumed and return a literal backslash.
        if (!isASCIIAlpha(control)) {
            state.restore();
            return PatternCharacterEscape('\\');
        }
        // Keep the low five bits of the letter as the resulting character value.
        return PatternCharacterEscape(control & 31);
    }

    // HexEscape
    case 'x': {
        consume();

        // The state is saved after 'x' is consumed, so on failure only the
        // attempted hex digits are given back.
        SavedState state(*this);
        int x = consumeHex(2);
        // -1 is treated as a failed hex parse; fall back to a literal 'x'.
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('x');
        }
        return PatternCharacterEscape(x);
    }

    // UnicodeEscape
    case 'u': {
        consume();

        // Same approach as the hex escape, with four hex digits requested.
        SavedState state(*this);
        int x = consumeHex(4);
        // -1 is treated as a failed hex parse; fall back to a literal 'u'.
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('u');
        }
        return PatternCharacterEscape(x);
    }

    // IdentityEscape
    default:
        // Any other character is consumed and stands for itself.
        return PatternCharacterEscape(consume());
    }
}
Example #15
0
// Returns true when |c| is an ASCII letter or an ASCII decimal digit.
template<typename CharType> inline bool isASCIIAlphanumeric(CharType c)
{
    if (isASCIIAlpha(c))
        return true;
    return isASCIIDigit(c);
}