// See: https://tools.ietf.org/html/bcp47#section-2.1 static bool isValidBCP47LanguageTag(const String& languageTag) { auto const length = languageTag.length(); // Max length picked as double the longest example tag in spec which is 49 characters: // https://tools.ietf.org/html/bcp47#section-4.4.2 if (length < 2 || length > 100) return false; UChar firstChar = languageTag[0]; if (!isASCIIAlpha(firstChar)) return false; UChar secondChar = languageTag[1]; if (length == 2) return isASCIIAlpha(secondChar); bool grandFatheredIrregularOrPrivateUse = (firstChar == 'i' || firstChar == 'x') && secondChar == '-'; unsigned nextCharIndexToCheck; if (!grandFatheredIrregularOrPrivateUse) { if (!isASCIIAlpha(secondChar)) return false; if (length == 3) return isASCIIAlpha(languageTag[2]); if (isASCIIAlpha(languageTag[2])) { if (languageTag[3] == '-') nextCharIndexToCheck = 4; else return false; } else if (languageTag[2] == '-') nextCharIndexToCheck = 3; else return false; } else nextCharIndexToCheck = 2; for (; nextCharIndexToCheck < length; ++nextCharIndexToCheck) { UChar c = languageTag[nextCharIndexToCheck]; if (isASCIIAlphanumeric(c) || c == '-') continue; return false; } return true; }
// This function converts Windows date/time pattern format [1][2] into LDML date // format pattern [3]. // // i.e. // We set h, H, m, s, d, dd, M, or y as is. They have same meaning in both of // Windows and LDML. // We need to convert the following patterns: // t -> a // tt -> a // ddd -> EEE // dddd -> EEEE // g -> G // gg -> ignore // // [1] http://msdn.microsoft.com/en-us/library/dd317787(v=vs.85).aspx // [2] http://msdn.microsoft.com/en-us/library/dd318148(v=vs.85).aspx // [3] LDML http://unicode.org/reports/tr35/tr35-6.html#Date_Format_Patterns static String convertWindowsDateTimeFormat(const String& format) { StringBuilder converted; StringBuilder literalBuffer; bool inQuote = false; bool lastQuoteCanBeLiteral = false; for (unsigned i = 0; i < format.length(); ++i) { UChar ch = format[i]; if (inQuote) { if (ch == '\'') { inQuote = false; ASSERT(i); if (lastQuoteCanBeLiteral && format[i - 1] == '\'') { literalBuffer.append('\''); lastQuoteCanBeLiteral = false; } else lastQuoteCanBeLiteral = true; } else literalBuffer.append(ch); continue; } if (ch == '\'') { inQuote = true; if (lastQuoteCanBeLiteral && i > 0 && format[i - 1] == '\'') { literalBuffer.append(ch); lastQuoteCanBeLiteral = false; } else lastQuoteCanBeLiteral = true; } else if (isASCIIAlpha(ch)) { commitLiteralToken(literalBuffer, converted); unsigned symbolStart = i; unsigned count = countContinuousLetters(format, i); i += count - 1; if (ch == 'h' || ch == 'H' || ch == 'm' || ch == 's' || ch == 'M' || ch == 'y') converted.append(format, symbolStart, count); else if (ch == 'd') { if (count <= 2) converted.append(format, symbolStart, count); else if (count == 3) converted.append("EEE"); else converted.append("EEEE"); } else if (ch == 'g') { if (count == 1) converted.append('G'); else { // gg means imperial era in Windows. // Just ignore it. 
} } else if (ch == 't') converted.append('a'); else literalBuffer.append(format, symbolStart, count); } else literalBuffer.append(ch); } commitLiteralToken(literalBuffer, converted); return converted.toString(); }
// http://dev.w3.org/csswg/css-syntax/#name-start-code-point static bool isNameStart(UChar c) { if (isASCIIAlpha(c)) return true; if (c == '_') return true; return !isASCII(c); }
// Reports whether |c| is a valid digit in the given numeric base. Decimal
// digits count as 0-9 and letters (either case) as 10 upwards; bases above 36
// are treated as 36 for the letter range.
static bool isCharacterAllowedInBase(UChar c, int base)
{
    if (c > 0x7F)
        return false;

    if (isASCIIDigit(c))
        return c - '0' < base;

    if (!isASCIIAlpha(c))
        return false;

    // Only 26 letters exist, so the usable letter range stops at base 36.
    const int cappedBase = base > 36 ? 36 : base;
    if (c >= 'a' && c < 'a' + cappedBase - 10)
        return true;
    return c >= 'A' && c < 'A' + cappedBase - 10;
}
// Calls testToken() for every code unit in [0, 1000], plus USHRT_MAX, pairing
// each one with the token type expected for it.
TEST(MediaQueryTokenizerCodepointsTest, Basic)
{
    for (UChar codeUnit = 0; codeUnit <= 1000; ++codeUnit) {
        if (isASCIIDigit(codeUnit)) {
            testToken(codeUnit, NumberToken);
            continue;
        }
        if (isASCIIAlpha(codeUnit) || codeUnit == '_') {
            testToken(codeUnit, IdentToken);
            continue;
        }

        switch (codeUnit) {
        case '\r':
        case ' ':
        case '\n':
        case '\t':
        case '\f':
            testToken(codeUnit, WhitespaceToken);
            break;
        case '(':
            testToken(codeUnit, LeftParenthesisToken);
            break;
        case ')':
            testToken(codeUnit, RightParenthesisToken);
            break;
        case '[':
            testToken(codeUnit, LeftBracketToken);
            break;
        case ']':
            testToken(codeUnit, RightBracketToken);
            break;
        case '{':
            testToken(codeUnit, LeftBraceToken);
            break;
        case '}':
            testToken(codeUnit, RightBraceToken);
            break;
        case '.':
        case '+':
        case '-':
        case '/':
        case '\\':
            testToken(codeUnit, DelimiterToken);
            break;
        case '\'':
        case '"':
            testToken(codeUnit, StringToken);
            break;
        case ',':
            testToken(codeUnit, CommaToken);
            break;
        case ':':
            testToken(codeUnit, ColonToken);
            break;
        case ';':
            testToken(codeUnit, SemicolonToken);
            break;
        case 0:
            testToken(codeUnit, EOFToken);
            break;
        default:
            // Everything above SCHAR_MAX is expected to be an identifier;
            // the remaining low values are expected to be delimiters.
            if (codeUnit > SCHAR_MAX)
                testToken(codeUnit, IdentToken);
            else
                testToken(codeUnit, DelimiterToken);
            break;
        }
    }
    testToken(USHRT_MAX, IdentToken);
}
static inline bool isCSSTokenizerIdentifier(const CharacterType* characters, unsigned length) { const CharacterType* end = characters + length; // -? if (characters != end && characters[0] == '-') ++characters; // {nmstart} if (characters == end || !(characters[0] == '_' || characters[0] >= 128 || isASCIIAlpha(characters[0]))) return false; ++characters; // {nmchar}* for (; characters != end; ++characters) { if (!(characters[0] == '_' || characters[0] == '-' || characters[0] >= 128 || isASCIIAlphanumeric(characters[0]))) return false; } return true; }
// "ident" from the CSS tokenizer, minus backslash-escape sequences static bool isCSSTokenizerIdentifier(const String& string) { const UChar* p = string.characters(); const UChar* end = p + string.length(); // -? if (p != end && p[0] == '-') ++p; // {nmstart} if (p == end || !(p[0] == '_' || p[0] >= 128 || isASCIIAlpha(p[0]))) return false; ++p; // {nmchar}* for (; p != end; ++p) { if (!(p[0] == '_' || p[0] == '-' || p[0] >= 128 || isASCIIAlphanumeric(p[0]))) return false; } return true; }
// Specification of the input: // http://icu-project.org/apiref/icu4c/classSimpleDateFormat.html#details static String localizeFormat(const Vector<UChar>& buffer) { StringBuilder builder; UChar lastChar = 0; bool inQuote = false; for (unsigned i = 0; i < buffer.size(); ++i) { if (inQuote) { if (buffer[i] == '\'') { inQuote = false; lastChar = 0; ASSERT(i); if (buffer[i - 1] == '\'') builder.append('\''); } else builder.append(buffer[i]); } else { if (isASCIIAlpha(lastChar) && lastChar == buffer[i]) continue; lastChar = buffer[i]; if (isICUYearSymbol(lastChar)) { String text = dateFormatYearText(); builder.append(text.isEmpty() ? "Year" : text); } else if (isICUMonthSymbol(lastChar)) { String text = dateFormatMonthText(); builder.append(text.isEmpty() ? "Month" : text); } else if (isICUDayInMonthSymbol(lastChar)) { String text = dateFormatDayInMonthText(); builder.append(text.isEmpty() ? "Day" : text); } else if (lastChar == '\'') inQuote = true; else builder.append(lastChar); } } return builder.toString(); }
// Walks |format| and hands it to printArg() piece by piece: runs of ordinary
// characters are printed via "%s", "%%" prints a percent sign, the extensions
// "%Js" / "%J+s" and "%b" are handled here, and any other "%..." specifier is
// copied into a local buffer and forwarded to printArg() together with |args|.
//
// NOTE(review): this excerpt ends at the closing brace of the main loop. The
// handleError and formatTooLong labels targeted by the gotos below, and the
// end of the function, are not visible here.
void FormatPrinter::print(const char* format, va_list args)
{
    const char* p = format;
    const char* errorStr;

    // buffer is only used for 2 purposes:
    // 1. To temporarily hold a copy of normal chars (not needing formatting)
    //    to be passed to printArg() and printed.
    //
    //    The incoming format string may contain a string of normal chars much
    //    longer than 128, but we handle this by breaking them out to 128 chars
    //    fragments and printing each fragment before re-using the buffer to
    //    load up the next fragment.
    //
    // 2. To hold a single "%..." format to be passed to printArg() to process
    //    a single va_arg.

    char buffer[129]; // 128 chars + null terminator.
    // end addresses the last element (index 128), the slot kept for the terminator.
    char* end = &buffer[sizeof(buffer) - 1];
    const char* startOfFormatSpecifier = 0;

    while (true) {
        char c = *p++;
        char* curr = buffer;

        // Print leading normal chars:
        while (c != '\0' && c != '%') {
            *curr++ = c;
            if (curr == end) {
                // Out of buffer space. Flush the fragment, and start over.
                *curr = '\0';
                bool success = printArg("%s", buffer);
                if (!success) {
                    errorStr = buffer;
                    goto handleError;
                }
                curr = buffer;
            }
            c = *p++;
        }
        // If we have stuff in the buffer, flush the fragment:
        if (curr != buffer) {
            ASSERT(curr < end + 1);
            *curr = '\0';
            bool success = printArg("%s", buffer);
            if (!success) {
                errorStr = buffer;
                goto handleError;
            }
        }

        // End if there are not more chars to print:
        if (c == '\0')
            break;

        // If we get here, we've must have seen a '%':
        startOfFormatSpecifier = p - 1;
        ASSERT(*startOfFormatSpecifier == '%');
        c = *p++;

        // Check for "%%" case:
        if (c == '%') {
            bool success = printArg("%c", '%');
            if (!success) {
                // p is one past the second '%', so p - 2 is the first '%'.
                errorStr = p - 2;
                goto handleError;
            }
            continue;
        }

        // Check for JS (%J<x>) formatting extensions:
        if (c == 'J') {
            bool verbose = false;

            c = *p++;
            if (UNLIKELY(c == '\0')) {
                // NOTE(review): p is already one past the '\0' here, so p - 2
                // addresses the 'J' rather than the '%' (which is
                // startOfFormatSpecifier). Confirm which one is intended.
                errorStr = p - 2; // Rewind to % in "%J\0"
                goto handleError;
            }

            // An optional '+' after the J selects verbose output.
            if (c == '+') {
                verbose = true;
                c= *p++;
                if (UNLIKELY(c == '\0')) {
                    // NOTE(review): as above, p - 3 addresses the 'J', not the '%'.
                    errorStr = p - 3; // Rewind to % in "%J+\0"
                    goto handleError;
                }
            }

            switch (c) {
            // %Js - WTF::String*
            case 's': {
                printWTFString(args, verbose);
                // This continue applies to the enclosing while loop, not the switch.
                continue;
            }
            } // END switch.

        // Check for non-JS extensions:
        } else if (c == 'b') {
            // %b prints an int argument as "TRUE" (non-zero) or "FALSE" (zero).
            // NOTE(review): unlike the other call sites, the printArg() result
            // is not checked here.
            int value = va_arg(args, int);
            printArg("%s", value ? "TRUE" : "FALSE");
            continue;
        }

        // If we didn't handle the format in one of the above cases,
        // rewind p and let the standard formatting check handle it
        // if possible:
        p = startOfFormatSpecifier;
        ASSERT(*p == '%');

        // Check for standard formatting:
        // A format specifier always starts with a % and ends with some
        // alphabet. We'll do the simple thing and scan until the next
        // alphabet, or the end of string.

        // In the following, we're going to use buffer as storage for a copy
        // of a single format specifier. Hence, conceptually, we can think of
        // 'buffer' as synonymous with 'argFormat' here:

        // Jumps to formatTooLong once the write cursor reaches the terminator slot.
#define ABORT_IF_FORMAT_TOO_LONG(curr) \
        do { \
            if (UNLIKELY(curr >= end)) \
                goto formatTooLong; \
        } while (false)

        curr = buffer;
        *curr++ = *p++; // Output the first % in the format specifier.
        c = *p++; // Grab the next char in the format specifier.

        // Checks for a single leading modifier e.g. "%-d":
        //     0, -, ' ', +, '\'', #
        if (c == '0' || c == '-' || c == ' ' || c == '+' || c == '\'' || c == '#') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }

        // Checks for decimal digit field width modifiers e.g. "%2f":
        while (c >= '0' && c <= '9') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }

        // Checks for '.' e.g. "%2.f":
        if (c == '.') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;

            // Checks for decimal digit precision modifiers e.g. "%.2f":
            while (c >= '0' && c <= '9') {
                ABORT_IF_FORMAT_TOO_LONG(curr);
                *curr++ = c;
                c = *p++;
            }
        }

        // Checks for the modifier <m> where <m> can be:
        //     l, h, j, t, z, L
        // e.g. "%ld"
        if (c == 'l' || c == 'h' || c == 'j' || c == 't' || c == 'z' || c == 'L') {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            char prevChar = c;
            c = *p++;

            // Checks for the modifier ll or hh in %<x><m>:
            if ((prevChar == 'l' || prevChar == 'h') && c == prevChar) {
                ABORT_IF_FORMAT_TOO_LONG(curr);
                *curr++ = c;
                c = *p++;
            }
        }

        // Checks for %<x> where <x> can be:
        //     d, i, n, o, u, x, X
        // But hey, we're just going to do the simple thing and allow any
        // alphabet. The user is expected to pass correct format specifiers.
        // We won't do any format checking here. We'll just pass it on, and the
        // underlying ...printf() implementation may do the needed checking
        // at its discretion.
        while (c != '\0' && !isASCIIAlpha(c)) {
            ABORT_IF_FORMAT_TOO_LONG(curr);
            *curr++ = c;
            c = *p++;
        }
        ABORT_IF_FORMAT_TOO_LONG(curr);
        // Copies either the terminating letter or, for a bad format, the '\0'
        // itself, so buffer is null-terminated on the error path below.
        *curr++ = c;
        if (c == '\0') {
            // Uh oh. Bad format. We should have gotten an alphabet instead.
            // Print the supposed format as a string instead:
            errorStr = buffer;
            goto handleError;
        }

        // Otherwise, we have the alpha that terminates the format.
        // Terminate the buffer (i.e. argFormat) string:
        ASSERT(isASCIIAlpha(c));
        ABORT_IF_FORMAT_TOO_LONG(curr);
        *curr = '\0';

        bool success = printArg(buffer, args);
        if (!success) {
            errorStr = buffer;
            goto handleError;
        }
    }
// True when |c| is an ASCII letter, an underscore, or a dollar sign.
static inline bool isSafeIdentifierStartCharacter(UChar c)
{
    if (isASCIIAlpha(c))
        return true;
    return c == '_' || c == '$';
}
// ASCII input qualifies when it is a letter, '$' or '_'; everything else is
// decided by isNonASCIIIdentStart().
static inline bool isIdentStart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentStart(c);
    return isASCIIAlpha(c) || c == '$' || c == '_';
}
void IntlDateTimeFormat::setFormatsFromPattern(const StringView& pattern) { // Get all symbols from the pattern, and set format fields accordingly. // http://unicode.org/reports/tr35/tr35-dates.html#Date_Field_Symbol_Table unsigned length = pattern.length(); for (unsigned i = 0; i < length; ++i) { UChar currentCharacter = pattern[i]; if (!isASCIIAlpha(currentCharacter)) continue; unsigned count = 1; while (i + 1 < length && pattern[i + 1] == currentCharacter) { ++count; ++i; } if (currentCharacter == 'h' || currentCharacter == 'K') m_hour12 = true; else if (currentCharacter == 'H' || currentCharacter == 'k') m_hour12 = false; switch (currentCharacter) { case 'G': if (count <= 3) m_era = Era::Short; else if (count == 4) m_era = Era::Long; else if (count == 5) m_era = Era::Narrow; break; case 'y': if (count == 1) m_year = Year::Numeric; else if (count == 2) m_year = Year::TwoDigit; break; case 'M': case 'L': if (count == 1) m_month = Month::Numeric; else if (count == 2) m_month = Month::TwoDigit; else if (count == 3) m_month = Month::Short; else if (count == 4) m_month = Month::Long; else if (count == 5) m_month = Month::Narrow; break; case 'E': case 'e': case 'c': if (count <= 3) m_weekday = Weekday::Short; else if (count == 4) m_weekday = Weekday::Long; else if (count == 5) m_weekday = Weekday::Narrow; break; case 'd': if (count == 1) m_day = Day::Numeric; else if (count == 2) m_day = Day::TwoDigit; break; case 'h': case 'H': case 'k': case 'K': if (count == 1) m_hour = Hour::Numeric; else if (count == 2) m_hour = Hour::TwoDigit; break; case 'm': if (count == 1) m_minute = Minute::Numeric; else if (count == 2) m_minute = Minute::TwoDigit; break; case 's': if (count == 1) m_second = Second::Numeric; else if (count == 2) m_second = Second::TwoDigit; break; case 'z': case 'v': case 'V': if (count == 1) m_timeZoneName = TimeZoneName::Short; else if (count == 4) m_timeZoneName = TimeZoneName::Long; break; } } }
// True when |ch| is an ASCII letter or a single quote.
static bool isASCIIAlphabetOrQuote(UChar ch)
{
    if (isASCIIAlpha(ch))
        return true;
    return ch == '\'';
}
// Parses the character(s) that follow a backslash and returns the matching
// Escape. |inCharacterClass| selects the interpretation used for escapes that
// appear inside a [...] class; it changes the result for \b, \B, \D, \S, \W
// and for decimal escapes.
// NOTE(review): presumably the caller has already consumed the backslash
// itself; that is not visible in this block.
Escape Parser::consumeEscape(bool inCharacterClass)
{
    switch (peek()) {
    case EndOfPattern:
        // Nothing follows the backslash: record the error and return an error escape.
        setError(EscapeUnterminated);
        return Escape(Escape::Error);

    // Assertions
    case 'b':
        consume();
        // Inside a class \b is the backspace character rather than an assertion.
        if (inCharacterClass)
            return PatternCharacterEscape('\b');
        return WordBoundaryAssertionEscape(false); // do not invert
    case 'B':
        consume();
        // Inside a class \B is just the letter 'B'.
        if (inCharacterClass)
            return PatternCharacterEscape('B');
        return WordBoundaryAssertionEscape(true); // invert

    // CharacterClassEscape
    case 'd':
        consume();
        return CharacterClassEscape(CharacterClass::digits(), false);
    case 's':
        consume();
        return CharacterClassEscape(CharacterClass::spaces(), false);
    case 'w':
        consume();
        return CharacterClassEscape(CharacterClass::wordchar(), false);

    // For the upper-case forms, a character class gets the pre-negated class
    // with the invert flag off; elsewhere the positive class is returned with
    // the invert flag on.
    case 'D':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nondigits(), false)
            : CharacterClassEscape(CharacterClass::digits(), true);
    case 'S':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonspaces(), false)
            : CharacterClassEscape(CharacterClass::spaces(), true);
    case 'W':
        consume();
        return inCharacterClass
            ? CharacterClassEscape(CharacterClass::nonwordchar(), false)
            : CharacterClassEscape(CharacterClass::wordchar(), true);

    // DecimalEscape
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9': {
        if (peekDigit() > m_numSubpatterns || inCharacterClass) {
            // To match Firefox, we parse an invalid backreference in the range [1-7]
            // as an octal escape.
            // For 8 and 9 the digit is left unconsumed and a literal backslash is returned.
            return peekDigit() > 7 ? PatternCharacterEscape('\\') : PatternCharacterEscape(consumeOctal());
        }

        // Take digits only while the accumulated number still names an
        // existing subpattern; the first digit that would exceed
        // m_numSubpatterns is left unconsumed.
        int value = 0;
        do {
            unsigned newValue = value * 10 + peekDigit();
            if (newValue > m_numSubpatterns)
                break;
            value = newValue;
            consume();
        } while (peekIsDigit());

        return BackreferenceEscape(value);
    }

    // Octal escape
    case '0':
        consume();
        return PatternCharacterEscape(consumeOctal());

    // ControlEscape
    case 'f':
        consume();
        return PatternCharacterEscape('\f');
    case 'n':
        consume();
        return PatternCharacterEscape('\n');
    case 'r':
        consume();
        return PatternCharacterEscape('\r');
    case 't':
        consume();
        return PatternCharacterEscape('\t');
    case 'v':
        consume();
        return PatternCharacterEscape('\v');

    // ControlLetter
    case 'c': {
        // State is captured before the 'c' is consumed.
        // NOTE(review): presumably restore() rewinds to that point, leaving
        // the 'c' unconsumed; confirm against SavedState.
        SavedState state(*this);
        consume();

        int control = consume();
        if (!isASCIIAlpha(control)) {
            state.restore();
            return PatternCharacterEscape('\\');
        }
        // Keeping the low five bits maps the letter to its control code.
        return PatternCharacterEscape(control & 31);
    }

    // HexEscape
    case 'x': {
        consume();

        // State is captured after the 'x', so a failed parse falls back to a literal 'x'.
        SavedState state(*this);
        int x = consumeHex(2);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('x');
        }
        return PatternCharacterEscape(x);
    }

    // UnicodeEscape
    case 'u': {
        consume();

        // State is captured after the 'u', so a failed parse falls back to a literal 'u'.
        SavedState state(*this);
        int x = consumeHex(4);
        if (x == -1) {
            state.restore();
            return PatternCharacterEscape('u');
        }
        return PatternCharacterEscape(x);
    }

    // IdentityEscape
    default:
        return PatternCharacterEscape(consume());
    }
}
// True when |c| is an ASCII decimal digit or an ASCII letter.
template<typename CharType> inline bool isASCIIAlphanumeric(CharType c)
{
    if (isASCIIDigit(c))
        return true;
    return isASCIIAlpha(c);
}