// Compares a Latin-1 string against a UTF-8 encoded byte range.
//
// a    - Latin-1 characters. No end pointer is passed, so the caller must
//        guarantee that `a` holds at least as many characters as [b, bEnd)
//        decodes to.
// b    - first byte of the UTF-8 data.
// bEnd - one past the last byte of the UTF-8 data.
//
// Returns true when every character decoded from [b, bEnd) equals the
// corresponding character of `a`; false on the first mismatch, on a truncated
// or malformed two-byte sequence, or when the UTF-8 side encodes a code point
// that does not fit in Latin-1.
bool equalLatin1WithUTF8(const LChar* a, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        // If either side is ASCII the two units must be identical.
        if (isASCII(*a) || isASCII(*b)) {
            if (*a++ != *b++)
                return false;
            continue;
        }

        // Both sides are non-ASCII: a Latin-1 character in 0x80..0xFF can only
        // be a two-byte UTF-8 sequence.
        if (b + 1 == bEnd)
            return false;

        if ((b[0] & 0xE0) != 0xC0 || (b[1] & 0xC0) != 0x80)
            return false;

        // Keep the decoded value untruncated. Narrowing it to LChar would make
        // e.g. U+0180 (0xC6 0x80) compare equal to Latin-1 0x80.
        const unsigned character = ((b[0] & 0x1F) << 6) | (b[1] & 0x3F);
        b += 2;

        if (character > 0xFF)
            return false;

        if (*a++ != character)
            return false;
    }

    return true;
}
unsigned calculateStringHashAndLengthFromUTF8MaskingTop8Bits(const char* data, const char* dataEnd, unsigned& dataLength, unsigned& utf16Length) { if (!data) return 0; StringHasher stringHasher; dataLength = 0; utf16Length = 0; while (data < dataEnd || (!dataEnd && *data)) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); dataLength++; utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); dataLength += utf8SequenceLength; if (!dataEnd) { for (int i = 1; i < utf8SequenceLength; ++i) { if (!data[i]) return 0; } } else if (dataEnd - data < utf8SequenceLength) { return 0; } if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else { return 0; } } return stringHasher.hashWithTop8BitsMasked(); }
// Upper-cases a single code point.
// Only ASCII input is converted (via std::toupper); anything else is returned
// unchanged because no case mapping is available for it here.
uint32 UString::toUpper(uint32 c)
{
    return isASCII(c) ? static_cast<uint32>(std::toupper(c)) : c;
}
void CombinedURLFilters::addDomain(uint64_t actionId, const String& domain) { // This is like adding (.|^)domain$ by adding two Vector<Term>'s, // but interpreting domain as a series of characters, not a regular expression. // This way a domain of "webkit.org" will match "bugs.webkit.org" and "webkit.org". // FIXME: Add support for matching only subdomains or no subdomains. Vector<Term> prependDot; Vector<Term> prependBeginningOfLine; prependDot.reserveInitialCapacity(domain.length() + 3); prependBeginningOfLine.reserveInitialCapacity(domain.length() + 1); // This is just no .* at the beginning. Term canonicalDotStar(Term::UniversalTransition); canonicalDotStar.quantify(AtomQuantifier::ZeroOrMore); prependDot.uncheckedAppend(canonicalDotStar); prependDot.uncheckedAppend(Term('.', true)); for (unsigned i = 0; i < domain.length(); i++) { ASSERT(isASCII(domain[i])); ASSERT(!isASCIIUpper(domain[i])); prependDot.uncheckedAppend(Term(domain[i], true)); prependBeginningOfLine.uncheckedAppend(Term(domain[i], true)); } prependDot.uncheckedAppend(Term::EndOfLineAssertionTerm); prependBeginningOfLine.uncheckedAppend(Term::EndOfLineAssertionTerm); addPattern(actionId, prependDot); addPattern(actionId, prependBeginningOfLine); }
// http://dev.w3.org/csswg/css-syntax/#name-start-code-point static bool isNameStart(UChar c) { if (isASCIIAlpha(c)) return true; if (c == '_') return true; return !isASCII(c); }
static bool containsOnlyASCIIWithNoUppercase(const String& domain) { for (unsigned i = 0; i < domain.length(); ++i) { UChar c = domain.at(i); if (!isASCII(c) || isASCIIUpper(c)) return false; } return true; }
/* Computes the value of a "\c" escape: returns toCTRL(source).
 *
 * source          the character that followed "\c"
 * utf8            when true, a non-ASCII source is fatal (croaks)
 * output_warning  when true, warn if the result is not a control character
 *
 * Outside of utf8, a non-ASCII source only raises a deprecation warning. */
STATIC char
S_grok_bslash_c(pTHX_ const char source, const bool utf8, const bool output_warning)
{
    U8 result;

    /* Trying to deprecate non-ASCII usages.  This construct has never
     * worked for a utf8 variant.  So, even though are accepting non-ASCII
     * Latin1 in 5.14, no need to make them work under utf8 */
    if (utf8 && ! isASCII(source)) {
        Perl_croak(aTHX_ "Character following \"\\c\" must be ASCII");
    }

    result = toCTRL(source);

    if (! isASCII(source)) {
        Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX),
                         "Character following \"\\c\" must be ASCII");
        return result;
    }

    /* Nothing more to say when the result is a real control character or
     * the caller asked for silence. */
    if (isCNTRL(result) || ! output_warning) {
        return result;
    }

    if (source == '{') {
        Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_SYNTAX),
                         "\"\\c{\" is deprecated and is more clearly written as \";\"");
        return result;
    }

    {
        /* Build the clearer spelling: the result itself, backslash-escaped
         * when it is not alphanumeric. */
        U8 clearer[3];
        U8 len = 0;

        if (! isALNUM(result)) {
            clearer[len++] = '\\';
        }
        clearer[len++] = result;
        clearer[len++] = '\0';

        Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX),
                       "\"\\c%c\" is more clearly written simply as \"%s\"",
                       source, clearer);
    }

    return result;
}
/* Copies s1 into s2, expanding shorthand ranges such as "a-d" into "abcd".
 *
 * A range is expanded only when both endpoints satisfy isASCII() and the
 * first endpoint sorts before the second; successive characters are produced
 * with nextASCII().  Everything else, including a leading or trailing '-',
 * is copied literally.
 * NOTE(review): isASCII()/nextASCII() are defined elsewhere; the original
 * comment suggests isASCII() accepts A-Za-z0-9 -- confirm.
 *
 * s1  NUL-terminated input.
 * s2  output buffer; the caller must make it large enough for the expansion.
 *
 * Fix: the previous version special-cased "end of string reached" and broke
 * out of the loop before copying the current character, so the last
 * character was lost ("abc" became "ab", "a-" became "-").  Those branches
 * are removed and the common copy below handles the end of the string. */
void expand(char s1[], char s2[])
{
    int i = 0, j = 0;
    char c, k;

    while ((c = s1[i]) != '\0') {
        /* s1[i+1] and s1[i+2] are read only while the preceding character is
         * known not to be the terminator, so this stays inside the string. */
        if (isASCII(c) && s1[i+1] == '-' && isASCII(s1[i+2]) && c < s1[i+2]) {
            /* emit c .. (end-1); the end character is copied below */
            for (k = c; k < s1[i+2]; k = nextASCII(k)) {
                s2[j++] = k;
            }
            i += 2;
        }
        s2[j++] = s1[i];
        i++;
    }
    s2[j] = '\0';
}
unsigned calculateStringHashFromUTF8(const char* data, const char* dataEnd, unsigned& utf16Length) { if (!data) return 0; WTF::StringHasher stringHasher; utf16Length = 0; while (data < dataEnd) { if (isASCII(*data)) { stringHasher.addCharacter(*data++); utf16Length++; continue; } int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*data); if (dataEnd - data < utf8SequenceLength) return false; if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(data), utf8SequenceLength)) return 0; UChar32 character = readUTF8Sequence(data, utf8SequenceLength); ASSERT(!isASCII(character)); if (U_IS_BMP(character)) { // UTF-16 surrogate values are illegal in UTF-32 if (U_IS_SURROGATE(character)) return 0; stringHasher.addCharacter(static_cast<UChar>(character)); // normal case utf16Length++; } else if (U_IS_SUPPLEMENTARY(character)) { stringHasher.addCharacters(static_cast<UChar>(U16_LEAD(character)), static_cast<UChar>(U16_TRAIL(character))); utf16Length += 2; } else return 0; } return stringHasher.hash(); }
/* Returns 1 when the text after the FIRST '.' in filename is exactly the
 * single character c, otherwise 0.
 *
 * 0 is also returned when the name contains no '.', or when isASCII()
 * reports 0 for a position before the dot. */
static int checkEnding(char c, char *filename)
{
    int dot;

    /* Locate the first '.', validating every character in front of it. */
    for (dot = 0; filename[dot] != '.'; dot++) {
        if (filename[dot] == '\0' || isASCII(filename+dot)==0)
            return 0;
    }

    /* The extension must be c followed directly by the terminator. */
    return (filename[dot+1] == c && filename[dot+2] == '\0') ? 1 : 0;
}
// Builds a regular-expression source that matches `text` literally: every
// ASCII character found in regexSpecialCharacters is prefixed with a
// backslash, all other characters are copied as they are.
static String createSearchRegexSource(const String& text)
{
    StringBuilder escaped;
    const unsigned length = text.length();

    for (unsigned index = 0; index < length; ++index) {
        const UChar current = text[index];
        const bool needsEscape = isASCII(current) && strchr(regexSpecialCharacters, current);
        if (needsEscape)
            escaped.append('\\');
        escaped.append(current);
    }

    return escaped.toString();
}
// Decodes one multi-byte UTF-8 sequence.
//
// sequence - points at the lead byte, which must not be ASCII.
// length   - number of bytes in the sequence: 2, 3 or 4 (asserted).
//
// Returns the decoded code point, or nonCharacter when a byte is outside the
// range allowed at its position.
static inline int decodeNonASCIISequence(const uint8_t* sequence, unsigned length)
{
    ASSERT(!isASCII(sequence[0]));

    const uint8_t lead = sequence[0];

    // Allowed range of the second byte. It is narrowed below for the lead
    // bytes whose full range would admit overlong forms, surrogates or values
    // past U+10FFFF.
    uint8_t secondMin = 0x80;
    uint8_t secondMax = 0xBF;

    if (length == 2) {
        ASSERT(lead <= 0xDF);
        if (lead < 0xC2)
            return nonCharacter;
        if (sequence[1] < secondMin || sequence[1] > secondMax)
            return nonCharacter;
        // The constant removes the marker bits of both bytes in one step.
        return ((lead << 6) + sequence[1]) - 0x00003080;
    }

    if (length == 3) {
        ASSERT(lead >= 0xE0 && lead <= 0xEF);
        if (lead == 0xE0)
            secondMin = 0xA0;
        else if (lead == 0xED)
            secondMax = 0x9F;

        if (sequence[1] < secondMin || sequence[1] > secondMax)
            return nonCharacter;
        if (sequence[2] < 0x80 || sequence[2] > 0xBF)
            return nonCharacter;
        return ((lead << 12) + (sequence[1] << 6) + sequence[2]) - 0x000E2080;
    }

    ASSERT(length == 4);
    ASSERT(lead >= 0xF0 && lead <= 0xF4);
    if (lead == 0xF0)
        secondMin = 0x90;
    else if (lead == 0xF4)
        secondMax = 0x8F;

    if (sequence[1] < secondMin || sequence[1] > secondMax)
        return nonCharacter;
    if (sequence[2] < 0x80 || sequence[2] > 0xBF)
        return nonCharacter;
    if (sequence[3] < 0x80 || sequence[3] > 0xBF)
        return nonCharacter;
    return ((lead << 18) + (sequence[1] << 12) + (sequence[2] << 6) + sequence[3]) - 0x03C82080;
}
// Compares the code units in [a, aEnd) with the UTF-16 code units that the
// UTF-8 bytes in [b, bEnd) decode to.
//
// Returns true only when both ranges are exhausted together and every unit
// matched. Returns false on a mismatch, on truncated or illegal UTF-8, on an
// encoded surrogate, or when the UTF-8 side decodes to more units than `a`
// holds.
//
// Fix: `a` used to be dereferenced without consulting aEnd, so a UTF-8 range
// longer than `a` read past the end of the buffer. Every read of `a` is now
// bounds-checked.
ALWAYS_INLINE bool equalWithUTF8Internal(const CharType* a, const CharType* aEnd, const char* b, const char* bEnd)
{
    while (b < bEnd) {
        if (isASCII(*b)) {
            if (a == aEnd || *a++ != *b++)
                return false;
            continue;
        }

        int utf8SequenceLength = inlineUTF8SequenceLengthNonASCII(*b);

        if (bEnd - b < utf8SequenceLength)
            return false;

        if (!isLegalUTF8(reinterpret_cast<const unsigned char*>(b), utf8SequenceLength))
            return false;

        UChar32 character = readUTF8Sequence(b, utf8SequenceLength);
        ASSERT(!isASCII(character));

        if (U_IS_BMP(character)) {
            // UTF-16 surrogate values are illegal in UTF-32
            if (U_IS_SURROGATE(character))
                return false;
            if (a == aEnd || *a++ != character)
                return false;
        } else if (U_IS_SUPPLEMENTARY(character)) {
            // A supplementary code point needs two units (lead + trail) in `a`.
            if (aEnd - a < 2)
                return false;
            if (*a++ != U16_LEAD(character))
                return false;
            if (*a++ != U16_TRAIL(character))
                return false;
        } else {
            return false;
        }
    }

    return a == aEnd;
}
// Drains the bytes buffered in m_partialSequence into a 16-bit destination,
// pulling additional bytes from [source, end) when the buffered sequence is
// incomplete.
//
// destination - output cursor, advanced past everything written.
// source      - input cursor, advanced past the bytes consumed to complete a
//               sequence.
// end         - one past the last input byte.
// flush       - when true, a sequence that cannot be completed is reported
//               through handleError(); when false the available bytes are
//               appended to m_partialSequence and the function returns.
// stopOnError - when true, the function returns right after handleError().
// sawError    - forwarded to handleError().
//
// Always returns false. Must be called with m_partialSequenceSize non-zero.
bool TextCodecUTF8::handlePartialSequence<UChar>(UChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool stopOnError, bool& sawError)
{
    ASSERT(m_partialSequenceSize);

    do {
        const uint8_t lead = m_partialSequence[0];

        if (isASCII(lead)) {
            *destination++ = lead;
            consumePartialSequenceByte();
            continue;
        }

        const int sequenceLength = nonASCIISequenceLength(lead);
        if (!sequenceLength) {
            handleError(destination, stopOnError, sawError);
            if (stopOnError)
                return false;
            continue;
        }

        if (sequenceLength > m_partialSequenceSize) {
            const int missing = sequenceLength - m_partialSequenceSize;

            if (missing > end - source) {
                if (!flush) {
                    // The new data is not enough to complete the sequence, so
                    // add it to the existing partial sequence.
                    memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                    m_partialSequenceSize += end - source;
                    return false;
                }
                // An incomplete partial sequence at the end is an error.
                handleError(destination, stopOnError, sawError);
                if (stopOnError)
                    return false;
                continue;
            }

            // Enough input is available: complete the buffered sequence.
            memcpy(m_partialSequence + m_partialSequenceSize, source, missing);
            source += missing;
            m_partialSequenceSize = sequenceLength;
        }

        const int decoded = decodeNonASCIISequence(m_partialSequence, sequenceLength);
        if (decoded == nonCharacter) {
            handleError(destination, stopOnError, sawError);
            if (stopOnError)
                return false;
            continue;
        }

        m_partialSequenceSize -= sequenceLength;
        destination = appendCharacter(destination, decoded);
    } while (m_partialSequenceSize);

    return false;
}
/* Prints the text found in the file named by argv[1].
 *
 * A "run" is a sequence of consecutive characters accepted by isASCII().
 * Output rules (unchanged from the previous version):
 *   - a run terminated by NEWLINE is printed, whatever its length, followed
 *     by the newline;
 *   - a run of at least MIN_RUN characters is printed when it is terminated
 *     by any other character;
 *   - shorter runs terminated by a non-NEWLINE character are discarded.
 *
 * Fixes:
 *   - argv[1] and the fopen() result are checked instead of dereferenced
 *     blindly;
 *   - runs used to be collected in a 5-byte array with no bound, so any run
 *     longer than 5 characters overflowed it.  Only the first MIN_RUN
 *     characters are held back now; once a run is known to qualify the rest
 *     is written as it arrives, which yields the same output;
 *   - fgetc() is read into an int and the loop ends on EOF rather than on a
 *     byte count taken from ftell() and narrowed to int;
 *   - the file is closed; unused locals are gone.
 *
 * Returns 0 on success, 1 when no file name was given or it cannot be opened. */
int main(int argc, char *argv[]){
    enum { MIN_RUN = 4 };
    FILE *fp;
    char pending[MIN_RUN];   /* start of the current run, not yet printed */
    int runLength = 0;       /* length of the current run, capped at MIN_RUN */
    int ch;

    if(argc < 2){
        fprintf(stderr, "usage: %s <file>\n", argc > 0 ? argv[0] : "program");
        return 1;
    }

    fp = fopen(argv[1], "r");
    if(fp == NULL){
        perror(argv[1]);
        return 1;
    }

    printf("\nFilename: %s\n", argv[1]);
    printf("-----------------------\n");

    while((ch = fgetc(fp)) != EOF){
        const char curChar = (char)ch;

        if(curChar == NEWLINE){
            /* A run that reached MIN_RUN has been written already. */
            if(runLength < MIN_RUN){
                fwrite(pending, 1, (size_t)runLength, stdout);
            }
            printf("%c", curChar);
            runLength = 0;
        }else if(isASCII(curChar)){
            if(runLength < MIN_RUN){
                pending[runLength++] = curChar;
                if(runLength == MIN_RUN){
                    /* The run now qualifies: release what was held back. */
                    fwrite(pending, 1, MIN_RUN, stdout);
                }
            }else{
                printf("%c", curChar);
            }
        }else{
            /* Run ended: short runs are dropped, long ones are already out. */
            runLength = 0;
        }
    }

    fclose(fp);
    return 0;
}
// function to read regular file static int readfile(const char *pathname, char *searchstr, const struct stat *statptr, int type) { // file descriptor int fd; char charbuf[1]; char linebuf[LINE_MAX]; int lbpos = 0; int errnum; // error handling: if fd==-1, error opening file if ( (fd = open(pathname, O_RDONLY)) < 0 ) { errnum = errno; my_errprintf("Error opening file: %s\n", strerror(errnum) ); } // Copy one line to buffer by reading file one byte // at a time until a newline character is reached // OR buffer is full while ( read(fd, charbuf, 1) > 0 ) { if ( isASCII(charbuf) == 0 ) // contains non-ascii char, skip file break; // New line character... // -put null char at end of line // then pass to mygrep and print line if match if ( (charbuf[0] == '\n') || (lbpos >= LINE_MAX-1)) { linebuf[lbpos] = '\0'; if ( mygrep(searchstr, strlen(searchstr), linebuf, strlen(linebuf)) == 1 ) { my_printf("Line: %s\nFile: %s\n", linebuf, pathname); } lbpos = 0; } else { // store byte in linebuf linebuf[lbpos] = (char) charbuf[0]; lbpos++ ; } } close(fd); return 0; }
/* Extracts a length-prefixed string.
 *
 * data     points at a length byte followed by that many characters
 * maxData  end of the readable data; when non-NULL it bounds the read
 *          NOTE(review): treated as one-past-the-end -- confirm with callers.
 *          If it actually points at the last valid byte, a string ending
 *          exactly there is rejected.
 *
 * Returns a newly malloc()ed, NUL-terminated copy that the caller must
 * free(), or NULL when the length is not positive, a character fails
 * isASCII(), the string would extend past maxData, or allocation fails.
 *
 * Fixes: maxData used to be ignored, so a corrupt length byte made the
 * function read past the buffer; the malloc() result was not checked. */
static char *findString(char *data, char *maxData)
{
    int length;
    int i;
    char *name;

    if (data == NULL)
        return NULL;

    /* The length byte itself must be readable. */
    if (maxData != NULL && data >= maxData)
        return NULL;

    length = *(data++);
    if (length <= 0)
        return NULL;

    /* The payload must fit in what is left of the buffer. */
    if (maxData != NULL && length > maxData - data)
        return NULL;

    for (i = 0; i < length; i++) {
        if (!isASCII(data[i]))
            return NULL;
    }

    name = (char *)malloc(length + 1);
    if (name == NULL)
        return NULL;

    memcpy(name, data, length);
    name[length] = '\0';
    return name;
}
/* Reports whether p is preceded by an odd number of bytes that are neither
 * ASCII nor hankaku katakana.
 *
 * The scan walks backwards from p (exclusive) towards base (inclusive) and
 * stops at the first byte accepted by isASCII() or SJIS_isHankakuKatakana().
 * Such a byte is either a second byte or a single-byte character, so it ends
 * the ambiguity; every byte before it could be a first or a second byte and
 * is counted.  The parity of that count is the result. */
Znk_INLINE bool SJIS_isSecondByte( const char* base, const char* p )
{
	int ambiguous_count = 0;
	const char* cursor = p;

	while( cursor > base ){
		--cursor;
		if( isASCII(*cursor) || SJIS_isHankakuKatakana(*cursor) ){
			/* second or ascii / second or hankaku_katakana */
			break;
		}
		/* second or first */
		++ambiguous_count;
	}
	return (bool)(ambiguous_count & 1);
}
// Drains the bytes buffered in m_partialSequence into an 8-bit destination,
// pulling additional bytes from [source, end) when the buffered sequence is
// incomplete.
//
// destination - output cursor, advanced past everything written.
// source      - input cursor, advanced past the bytes consumed to complete a
//               sequence.
// end         - one past the last input byte.
// flush       - when false, an incomplete sequence is extended with the
//               available input and kept for a later call.
// The two trailing parameters are unused in this specialization.
//
// Returns true when the 8-bit path cannot continue: an invalid lead byte, an
// incomplete sequence while flushing, an undecodable sequence, or a character
// above 0xff. Returns false otherwise.
// Must be called with m_partialSequenceSize non-zero.
bool TextCodecUTF8::handlePartialSequence<LChar>(LChar*& destination, const uint8_t*& source, const uint8_t* end, bool flush, bool, bool&)
{
    ASSERT(m_partialSequenceSize);

    do {
        const uint8_t lead = m_partialSequence[0];

        if (isASCII(lead)) {
            *destination++ = lead;
            consumePartialSequenceByte();
            continue;
        }

        const int sequenceLength = nonASCIISequenceLength(lead);
        if (!sequenceLength)
            return true;

        if (sequenceLength > m_partialSequenceSize) {
            const int missing = sequenceLength - m_partialSequenceSize;

            if (missing > end - source) {
                if (flush) {
                    // An incomplete partial sequence at the end is an error, but it will create
                    // a 16 bit string due to the replacementCharacter. Let the 16 bit path handle
                    // the error.
                    return true;
                }
                // The new data is not enough to complete the sequence, so
                // add it to the existing partial sequence.
                memcpy(m_partialSequence + m_partialSequenceSize, source, end - source);
                m_partialSequenceSize += end - source;
                return false;
            }

            // Enough input is available: complete the buffered sequence.
            memcpy(m_partialSequence + m_partialSequenceSize, source, missing);
            source += missing;
            m_partialSequenceSize = sequenceLength;
        }

        const int decoded = decodeNonASCIISequence(m_partialSequence, sequenceLength);
        if (decoded == nonCharacter || decoded > 0xff)
            return true;

        m_partialSequenceSize -= sequenceLength;
        *destination++ = decoded;
    } while (m_partialSequenceSize);

    return false;
}
// Decodes `length` bytes into a 16-bit String.
//
// ASCII bytes are copied directly, a machine word at a time when the source
// is aligned; every other byte is mapped through `table`. The three trailing
// parameters are unused.
//
// Fix: after the word-at-a-time loop the cursor may rest on a byte that is no
// longer ASCII (the loop stops at alignedEnd, not at the first non-ASCII
// byte). That byte used to be copied without the table lookup; it is now
// routed to the lookup path.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    UChar* characters;
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    const uint8_t* alignedEnd = alignToMachineWord(end);
    UChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // *source may not be ASCII anymore if source moved inside the
                // loop of the fast code path.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;
}
void guess_encoding(const char *data, size_t size) { #define FOUND(name) \ do { \ if (found) fputs(", ", stdout); \ fputs(name, stdout); \ found = 1; \ } while (0) int found = 0; dump_byte_string(stdout, "guess_encoding(\"", data, size, "\"): "); if (size >= 3) { if (memcmp(data, UTF_8_BOM, 3) == 0) FOUND("UTF-8 (BOM)"); } if (size >= 4) { if (memcmp(data, UTF_32_LE_BOM, 4) == 0) FOUND("UTF-32-LE (BOM)"); if (memcmp(data, UTF_32_BE_BOM, 4) == 0) FOUND("UTF-32-BE (BOM)"); } if (size >= 2) { if (memcmp(data, UTF_16_LE_BOM, 2) == 0) FOUND("UTF-16-LE (BOM)"); if (memcmp(data, UTF_16_BE_BOM, 2) == 0) FOUND("UTF-16-BE (BOM)"); } if (isASCII(data, size)) { FOUND("ASCII"); } if (isUTF8(data, size)) { FOUND("UTF-8"); } if (!found) printf("<unknown>"); fputs("\n", stdout); }
// Consumes one code unit and produces the token that starts with it.
//
// Unlike the HTMLTokenizer, the CSS Syntax spec is written
// as a stateless, (fixed-size) look-ahead tokenizer.
// We could move to the stateful model and instead create
// states for all the "next 3 codepoints are X" cases.
// State-machine tokenizers are easier to write to handle
// incremental tokenization of partial sources.
// However, for now we follow the spec exactly.
CSSParserToken CSSTokenizer::nextToken()
{
    const UChar cc = consume();

    // ASCII code units are dispatched through the codePoints table; every
    // non-ASCII code unit is handled by nameStart.
    CodePoint handler;
    if (!isASCII(cc))
        handler = &CSSTokenizer::nameStart;
    else {
        ASSERT_WITH_SECURITY_IMPLICATION(cc < codePointsNumber);
        handler = codePoints[cc];
    }

    // A null table entry means the code unit becomes a delimiter token.
    if (!handler)
        return CSSParserToken(DelimiterToken, cc);

    return (this->*handler)(cc);
}
// Decodes `length` bytes into a String.
//
// Decoding starts into an 8-bit buffer. ASCII bytes are copied directly, a
// machine word at a time when the source is aligned; every other byte is
// mapped through `table`. The first byte whose table entry exceeds 0xff
// switches to the 16-bit path at upConvertTo16Bit, which widens what has been
// decoded so far and continues from the same byte. An empty input returns
// emptyString(). The three trailing parameters are unused.
//
// Fix: after the word-at-a-time loop the cursor may rest on a byte that is no
// longer ASCII (the loop stops at alignedEnd, not at the first non-ASCII
// byte). That byte used to be copied without the table lookup; both the 8-bit
// and the 16-bit loop now route it to their lookup path.
String TextCodecLatin1::decode(const char* bytes, size_t length, bool, bool, bool&)
{
    LChar* characters;
    if (!length)
        return emptyString();
    String result = String::createUninitialized(length, characters);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = reinterpret_cast<const uint8_t*>(bytes + length);
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = characters;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable;

                    copyASCIIMachineWord(destination, source);
                    source += sizeof(MachineWord);
                    destination += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // *source may not be ASCII anymore if source moved inside the
                // loop of the fast code path.
                if (!isASCII(*source))
                    goto useLookupTable;
            }
            *destination = *source;
        } else {
useLookupTable:
            if (table[*source] > 0xff)
                goto upConvertTo16Bit;

            *destination = table[*source];
        }

        ++source;
        ++destination;
    }

    return result;

upConvertTo16Bit:
    UChar* characters16;
    String result16 = String::createUninitialized(length, characters16);

    UChar* destination16 = characters16;

    // Zero extend and copy already processed 8 bit data
    LChar* ptr8 = characters;
    LChar* endPtr8 = destination;

    while (ptr8 < endPtr8)
        *destination16++ = *ptr8++;

    // Handle the character that triggered the 16 bit path
    *destination16 = table[*source];
    ++source;
    ++destination16;

    while (source < end) {
        if (isASCII(*source)) {
            // Fast path for ASCII. Most Latin-1 text will be ASCII.
            if (isAlignedToMachineWord(source)) {
                while (source < alignedEnd) {
                    MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);

                    if (!isAllASCII<LChar>(chunk))
                        goto useLookupTable16;

                    copyASCIIMachineWord(destination16, source);
                    source += sizeof(MachineWord);
                    destination16 += sizeof(MachineWord);
                }

                if (source == end)
                    break;

                // *source may not be ASCII anymore if source moved inside the
                // loop of the fast code path.
                if (!isASCII(*source))
                    goto useLookupTable16;
            }
            *destination16 = *source;
        } else {
useLookupTable16:
            *destination16 = table[*source];
        }

        ++source;
        ++destination16;
    }

    return result16;
}
// Decodes `length` bytes of UTF-8 into a String.
//
// bytes       - input bytes.
// length      - number of input bytes.
// flush       - forwarded to handlePartialSequence(); also keeps the outer
//               loops running while bytes remain in m_partialSequence.
// stopOnError - when true, decoding stops at the first sequence that decodes
//               to nonCharacter.
// sawError    - set to true when a sequence decodes to nonCharacter.
//
// Decoding starts into an 8-bit (LChar) buffer. Control jumps to the
// upConvertTo16Bit label, which continues into a 16-bit (UChar) buffer, when
// handlePartialSequence() returns true, when a decode error occurs without
// stopOnError, or when a decoded character exceeds 0xff.
//
// Bytes of a sequence that is cut off at the end of the input are saved in
// m_partialSequence / m_partialSequenceSize.
String TextCodecUTF8::decode(const char* bytes, size_t length, bool flush, bool stopOnError, bool& sawError)
{
    // Each input byte might turn into a character.
    // That includes all bytes in the partial-sequence buffer because
    // each byte in an invalid sequence will turn into a replacement character.
    StringBuffer<LChar> buffer(m_partialSequenceSize + length);

    const uint8_t* source = reinterpret_cast<const uint8_t*>(bytes);
    const uint8_t* end = source + length;
    const uint8_t* alignedEnd = alignToMachineWord(end);
    LChar* destination = buffer.characters();

    // 8-bit path.
    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            LChar* destinationForHandlePartialSequence = destination;
            const uint8_t* sourceForHandlePartialSequence = source;
            if (handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError)) {
                // Only the source cursor is taken over before switching to the 16-bit path.
                source = sourceForHandlePartialSequence;
                goto upConvertTo16Bit;
            }
            destination = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes are still buffered: leave the loop without decoding further.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination, source);
                        source += sizeof(MachineWord);
                        destination += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop may have stopped on a non-ASCII byte; restart
                    // the iteration so that it takes the multi-byte path.
                    if (!isASCII(*source))
                        continue;
                }
                *destination++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence is cut off by the end of the input: save its
                    // bytes in m_partialSequence and consume the rest of the input.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;

                // source has not been advanced, so the 16-bit path sees the
                // offending byte again.
                goto upConvertTo16Bit;
            }
            // The character does not fit in 8 bits; source still points at its first byte.
            if (character > 0xff)
                goto upConvertTo16Bit;

            source += count;
            *destination++ = character;
        }
    } while (flush && m_partialSequenceSize);

    buffer.shrink(destination - buffer.characters());

    return String::adopt(buffer);

upConvertTo16Bit:
    // 16-bit path: continues from the current source position.
    StringBuffer<UChar> buffer16(m_partialSequenceSize + length);

    UChar* destination16 = buffer16.characters();

    // Copy the already converted characters
    for (LChar* converted8 = buffer.characters(); converted8 < destination;)
        *destination16++ = *converted8++;

    do {
        if (m_partialSequenceSize) {
            // Explicitly copy destination and source pointers to avoid taking pointers to the
            // local variables, which may harm code generation by disabling some optimizations
            // in some compilers.
            UChar* destinationForHandlePartialSequence = destination16;
            const uint8_t* sourceForHandlePartialSequence = source;
            handlePartialSequence(destinationForHandlePartialSequence, sourceForHandlePartialSequence, end, flush, stopOnError, sawError);
            destination16 = destinationForHandlePartialSequence;
            source = sourceForHandlePartialSequence;
            // Bytes are still buffered: leave the loop without decoding further.
            if (m_partialSequenceSize)
                break;
        }

        while (source < end) {
            if (isASCII(*source)) {
                // Fast path for ASCII. Most UTF-8 text will be ASCII.
                if (isAlignedToMachineWord(source)) {
                    while (source < alignedEnd) {
                        MachineWord chunk = *reinterpret_cast_ptr<const MachineWord*>(source);
                        if (!isAllASCII<LChar>(chunk))
                            break;
                        copyASCIIMachineWord(destination16, source);
                        source += sizeof(MachineWord);
                        destination16 += sizeof(MachineWord);
                    }
                    if (source == end)
                        break;
                    // The word loop may have stopped on a non-ASCII byte; restart
                    // the iteration so that it takes the multi-byte path.
                    if (!isASCII(*source))
                        continue;
                }
                *destination16++ = *source++;
                continue;
            }
            int count = nonASCIISequenceLength(*source);
            int character;
            if (!count)
                character = nonCharacter;
            else {
                if (count > end - source) {
                    // The sequence is cut off by the end of the input: save its
                    // bytes in m_partialSequence and consume the rest of the input.
                    ASSERT_WITH_SECURITY_IMPLICATION(end - source < static_cast<ptrdiff_t>(sizeof(m_partialSequence)));
                    ASSERT(!m_partialSequenceSize);
                    m_partialSequenceSize = end - source;
                    memcpy(m_partialSequence, source, m_partialSequenceSize);
                    source = end;
                    break;
                }
                character = decodeNonASCIISequence(source, count);
            }
            if (character == nonCharacter) {
                sawError = true;
                if (stopOnError)
                    break;
                // Each error generates a replacement character and consumes one byte.
                *destination16++ = replacementCharacter;
                ++source;
                continue;
            }
            source += count;
            destination16 = appendCharacter(destination16, character);
        }
    } while (flush && m_partialSequenceSize);

    buffer16.shrink(destination16 - buffer16.characters());

    return String::adopt(buffer16);
}
// True when c is ASCII and std::iscntrl() classifies it as a control
// character. Non-ASCII input is never passed to std::iscntrl().
bool UString::isCntrl(uint32 c)
{
    if (!isASCII(c))
        return false;

    return std::iscntrl(c) != 0;
}
// True when c may appear inside an identifier: for ASCII, an alphanumeric
// character, '$' or '_'; for anything else, whatever isNonASCIIIdentPart()
// decides.
static inline bool isIdentPart(int c)
{
    if (!isASCII(c))
        return isNonASCIIIdentPart(c);

    return isASCIIAlphanumeric(c) || c == '$' || c == '_';
}
// True when c is ASCII and std::isdigit() classifies it as a decimal digit.
// Non-ASCII input is never passed to std::isdigit().
bool UString::isDigit(uint32 c)
{
    if (!isASCII(c))
        return false;

    return std::isdigit(c) != 0;
}
// True when c is ASCII and std::isalpha() classifies it as a letter.
// Non-ASCII input is never passed to std::isalpha().
bool UString::isAlpha(uint32 c)
{
    if (!isASCII(c))
        return false;

    return std::isalpha(c) != 0;
}
void UTS46Test::TestSomeCases() { IcuTestErrorCode errorCode(*this, "TestSomeCases"); char buffer[400], buffer2[400]; int32_t i; for(i=0; i<UPRV_LENGTHOF(testCases); ++i) { const TestCase &testCase=testCases[i]; UnicodeString input(ctou(testCase.s)); UnicodeString expected(ctou(testCase.u)); // ToASCII/ToUnicode, transitional/nontransitional UnicodeString aT, uT, aN, uN; IDNAInfo aTInfo, uTInfo, aNInfo, uNInfo; trans->nameToASCII(input, aT, aTInfo, errorCode); trans->nameToUnicode(input, uT, uTInfo, errorCode); nontrans->nameToASCII(input, aN, aNInfo, errorCode); nontrans->nameToUnicode(input, uN, uNInfo, errorCode); if(errorCode.logIfFailureAndReset("first-level processing [%d/%s] %s", (int)i, testCase.o, testCase.s) ) { continue; } // ToUnicode does not set length-overflow errors. uint32_t uniErrors=testCase.errors&~ (UIDNA_ERROR_LABEL_TOO_LONG| UIDNA_ERROR_DOMAIN_NAME_TOO_LONG); char mode=testCase.o[0]; if(mode=='B' || mode=='N') { if(uNInfo.getErrors()!=uniErrors) { errln("N.nameToUnicode([%d] %s) unexpected errors %04lx", (int)i, testCase.s, (long)uNInfo.getErrors()); continue; } if(uN!=expected) { prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); errln("N.nameToUnicode([%d] %s) unexpected string %s", (int)i, testCase.s, buffer); continue; } if(aNInfo.getErrors()!=testCase.errors) { errln("N.nameToASCII([%d] %s) unexpected errors %04lx", (int)i, testCase.s, (long)aNInfo.getErrors()); continue; } } if(mode=='B' || mode=='T') { if(uTInfo.getErrors()!=uniErrors) { errln("T.nameToUnicode([%d] %s) unexpected errors %04lx", (int)i, testCase.s, (long)uTInfo.getErrors()); continue; } if(uT!=expected) { prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); errln("T.nameToUnicode([%d] %s) unexpected string %s", (int)i, testCase.s, buffer); continue; } if(aTInfo.getErrors()!=testCase.errors) { errln("T.nameToASCII([%d] %s) unexpected errors %04lx", (int)i, testCase.s, (long)aTInfo.getErrors()); continue; } } // ToASCII is all-ASCII if no 
severe errors if((aNInfo.getErrors()&severeErrors)==0 && !isASCII(aN)) { prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); errln("N.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s", (int)i, testCase.s, aNInfo.getErrors(), buffer); continue; } if((aTInfo.getErrors()&severeErrors)==0 && !isASCII(aT)) { prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); errln("T.nameToASCII([%d] %s) (errors %04lx) result is not ASCII %s", (int)i, testCase.s, aTInfo.getErrors(), buffer); continue; } if(verbose) { char m= mode=='B' ? mode : 'N'; prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); logln("%c.nameToASCII([%d] %s) (errors %04lx) result string: %s", m, (int)i, testCase.s, aNInfo.getErrors(), buffer); if(mode!='B') { prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); logln("T.nameToASCII([%d] %s) (errors %04lx) result string: %s", (int)i, testCase.s, aTInfo.getErrors(), buffer); } } // second-level processing UnicodeString aTuN, uTaN, aNuN, uNaN; IDNAInfo aTuNInfo, uTaNInfo, aNuNInfo, uNaNInfo; nontrans->nameToUnicode(aT, aTuN, aTuNInfo, errorCode); nontrans->nameToASCII(uT, uTaN, uTaNInfo, errorCode); nontrans->nameToUnicode(aN, aNuN, aNuNInfo, errorCode); nontrans->nameToASCII(uN, uNaN, uNaNInfo, errorCode); if(errorCode.logIfFailureAndReset("second-level processing [%d/%s] %s", (int)i, testCase.o, testCase.s) ) { continue; } if(aN!=uNaN) { prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(uNaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("N.nameToASCII([%d] %s)!=N.nameToUnicode().N.nameToASCII() " "(errors %04lx) %s vs. %s", (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2); continue; } if(aT!=uTaN) { prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(uTaN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("T.nameToASCII([%d] %s)!=T.nameToUnicode().N.nameToASCII() " "(errors %04lx) %s vs. 
%s", (int)i, testCase.s, aNInfo.getErrors(), buffer, buffer2); continue; } if(uN!=aNuN) { prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(aNuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("N.nameToUnicode([%d] %s)!=N.nameToASCII().N.nameToUnicode() " "(errors %04lx) %s vs. %s", (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2); continue; } if(uT!=aTuN) { prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(aTuN).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("T.nameToUnicode([%d] %s)!=T.nameToASCII().N.nameToUnicode() " "(errors %04lx) %s vs. %s", (int)i, testCase.s, uNInfo.getErrors(), buffer, buffer2); continue; } // labelToUnicode UnicodeString aTL, uTL, aNL, uNL; IDNAInfo aTLInfo, uTLInfo, aNLInfo, uNLInfo; trans->labelToASCII(input, aTL, aTLInfo, errorCode); trans->labelToUnicode(input, uTL, uTLInfo, errorCode); nontrans->labelToASCII(input, aNL, aNLInfo, errorCode); nontrans->labelToUnicode(input, uNL, uNLInfo, errorCode); if(errorCode.logIfFailureAndReset("labelToXYZ processing [%d/%s] %s", (int)i, testCase.o, testCase.s) ) { continue; } if(aN.indexOf((UChar)0x2e)<0) { if(aN!=aNL || aNInfo.getErrors()!=aNLInfo.getErrors()) { prettify(aN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(aNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("N.nameToASCII([%d] %s)!=N.labelToASCII() " "(errors %04lx vs %04lx) %s vs. 
%s", (int)i, testCase.s, aNInfo.getErrors(), aNLInfo.getErrors(), buffer, buffer2); continue; } } else { if((aNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) { errln("N.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT", (int)i, testCase.s, (long)aNLInfo.getErrors()); continue; } } if(aT.indexOf((UChar)0x2e)<0) { if(aT!=aTL || aTInfo.getErrors()!=aTLInfo.getErrors()) { prettify(aT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(aTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("T.nameToASCII([%d] %s)!=T.labelToASCII() " "(errors %04lx vs %04lx) %s vs. %s", (int)i, testCase.s, aTInfo.getErrors(), aTLInfo.getErrors(), buffer, buffer2); continue; } } else { if((aTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) { errln("T.labelToASCII([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT", (int)i, testCase.s, (long)aTLInfo.getErrors()); continue; } } if(uN.indexOf((UChar)0x2e)<0) { if(uN!=uNL || uNInfo.getErrors()!=uNLInfo.getErrors()) { prettify(uN).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(uNL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("N.nameToUnicode([%d] %s)!=N.labelToUnicode() " "(errors %04lx vs %04lx) %s vs. %s", (int)i, testCase.s, uNInfo.getErrors(), uNLInfo.getErrors(), buffer, buffer2); continue; } } else { if((uNLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) { errln("N.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT", (int)i, testCase.s, (long)uNLInfo.getErrors()); continue; } } if(uT.indexOf((UChar)0x2e)<0) { if(uT!=uTL || uTInfo.getErrors()!=uTLInfo.getErrors()) { prettify(uT).extract(0, 0x7fffffff, buffer, UPRV_LENGTHOF(buffer)); prettify(uTL).extract(0, 0x7fffffff, buffer2, UPRV_LENGTHOF(buffer2)); errln("T.nameToUnicode([%d] %s)!=T.labelToUnicode() " "(errors %04lx vs %04lx) %s vs. 
%s", (int)i, testCase.s, uTInfo.getErrors(), uTLInfo.getErrors(), buffer, buffer2); continue; } } else { if((uTLInfo.getErrors()&UIDNA_ERROR_LABEL_HAS_DOT)==0) { errln("T.labelToUnicode([%d] %s) errors %04lx missing UIDNA_ERROR_LABEL_HAS_DOT", (int)i, testCase.s, (long)uTLInfo.getErrors()); continue; } } // Differences between transitional and nontransitional processing if(mode=='B') { if( aNInfo.isTransitionalDifferent() || aTInfo.isTransitionalDifferent() || uNInfo.isTransitionalDifferent() || uTInfo.isTransitionalDifferent() || aNLInfo.isTransitionalDifferent() || aTLInfo.isTransitionalDifferent() || uNLInfo.isTransitionalDifferent() || uTLInfo.isTransitionalDifferent() ) { errln("B.process([%d] %s) isTransitionalDifferent()", (int)i, testCase.s); continue; } if( aN!=aT || uN!=uT || aNL!=aTL || uNL!=uTL || aNInfo.getErrors()!=aTInfo.getErrors() || uNInfo.getErrors()!=uTInfo.getErrors() || aNLInfo.getErrors()!=aTLInfo.getErrors() || uNLInfo.getErrors()!=uTLInfo.getErrors() ) { errln("N.process([%d] %s) vs. T.process() different errors or result strings", (int)i, testCase.s); continue; } } else { if( !aNInfo.isTransitionalDifferent() || !aTInfo.isTransitionalDifferent() || !uNInfo.isTransitionalDifferent() || !uTInfo.isTransitionalDifferent() || !aNLInfo.isTransitionalDifferent() || !aTLInfo.isTransitionalDifferent() || !uNLInfo.isTransitionalDifferent() || !uTLInfo.isTransitionalDifferent() ) { errln("%s.process([%d] %s) !isTransitionalDifferent()", testCase.o, (int)i, testCase.s); continue; } if(aN==aT || uN==uT || aNL==aTL || uNL==uTL) { errln("N.process([%d] %s) vs. 
T.process() same result strings", (int)i, testCase.s); continue; } } // UTF-8 std::string input8, aT8, uT8, aN8, uN8; StringByteSink<std::string> aT8Sink(&aT8), uT8Sink(&uT8), aN8Sink(&aN8), uN8Sink(&uN8); IDNAInfo aT8Info, uT8Info, aN8Info, uN8Info; input.toUTF8String(input8); trans->nameToASCII_UTF8(input8, aT8Sink, aT8Info, errorCode); trans->nameToUnicodeUTF8(input8, uT8Sink, uT8Info, errorCode); nontrans->nameToASCII_UTF8(input8, aN8Sink, aN8Info, errorCode); nontrans->nameToUnicodeUTF8(input8, uN8Sink, uN8Info, errorCode); if(errorCode.logIfFailureAndReset("UTF-8 processing [%d/%s] %s", (int)i, testCase.o, testCase.s) ) { continue; } UnicodeString aT16(UnicodeString::fromUTF8(aT8)); UnicodeString uT16(UnicodeString::fromUTF8(uT8)); UnicodeString aN16(UnicodeString::fromUTF8(aN8)); UnicodeString uN16(UnicodeString::fromUTF8(uN8)); if( aN8Info.getErrors()!=aNInfo.getErrors() || uN8Info.getErrors()!=uNInfo.getErrors() ) { errln("N.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx", (int)i, testCase.s, (long)aN8Info.getErrors(), (long)aNInfo.getErrors()); continue; } if( aT8Info.getErrors()!=aTInfo.getErrors() || uT8Info.getErrors()!=uTInfo.getErrors() ) { errln("T.xyzUTF8([%d] %s) vs. UTF-16 processing different errors %04lx vs. %04lx", (int)i, testCase.s, (long)aT8Info.getErrors(), (long)aTInfo.getErrors()); continue; } if(aT16!=aT || uT16!=uT || aN16!=aN || uN16!=uN) { errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different string results", testCase.o, (int)i, testCase.s, (long)aTInfo.getErrors()); continue; } if( aT8Info.isTransitionalDifferent()!=aTInfo.isTransitionalDifferent() || uT8Info.isTransitionalDifferent()!=uTInfo.isTransitionalDifferent() || aN8Info.isTransitionalDifferent()!=aNInfo.isTransitionalDifferent() || uN8Info.isTransitionalDifferent()!=uNInfo.isTransitionalDifferent() ) { errln("%s.xyzUTF8([%d] %s) vs. UTF-16 processing different isTransitionalDifferent()", testCase.o, (int)i, testCase.s); continue; } } }
// True when c is ASCII and std::isalnum() classifies it as a letter or digit.
// Non-ASCII input is never passed to std::isalnum().
bool UString::isAlNum(uint32 c)
{
    if (!isASCII(c))
        return false;

    return std::isalnum(c) != 0;
}