/*
 * Unicode_FoldCase --
 *
 *      Builds a case-folded copy of 'str'.  The input is converted to a
 *      temporary UTF-16 buffer, every code unit of that buffer is replaced
 *      by UnicodeSimpleCaseFold() of itself, and a new string is created
 *      from the folded buffer.
 *
 *      Folding is done one UTF-16 code unit at a time, so foldings whose
 *      result needs more than one code unit are not performed.
 *
 * Results:
 *      The string returned by Unicode_AllocWithUTF16() for the folded
 *      buffer, or NULL if the UTF-16 copy of 'str' could not be obtained.
 *      The input string is not modified.
 */
Unicode
Unicode_FoldCase(ConstUnicode str) // IN
{
   Unicode folded;
   utf16_t *utf16;
   utf16_t *utf16Current;

   ASSERT(str);

   utf16 = Unicode_GetAllocBytes(str, STRING_ENCODING_UTF16);
   if (!utf16) {
      /*
       * No buffer to fold (conversion/allocation failed, or 'str' was NULL
       * in a build where ASSERT is compiled out).  Bail out rather than
       * dereferencing a NULL pointer below.
       */
      return NULL;
   }

   // Fold in place; the buffer is terminated by a zero code unit.
   for (utf16Current = utf16; *utf16Current; utf16Current++) {
      *utf16Current = UnicodeSimpleCaseFold(*utf16Current);
   }

   folded = Unicode_AllocWithUTF16(utf16);

   // The temporary UTF-16 copy is owned by this function.
   free(utf16);

   return folded;
}
int Unicode_CompareRange(ConstUnicode str1, // IN UnicodeIndex str1Start, // IN UnicodeIndex str1Length, // IN ConstUnicode str2, // IN UnicodeIndex str2Start, // IN UnicodeIndex str2Length, // IN Bool ignoreCase) // IN { int result = -1; Unicode substr1 = NULL; Unicode substr2 = NULL; utf16_t *substr1UTF16 = NULL; utf16_t *substr2UTF16 = NULL; UnicodeIndex i = 0; UnicodeIndex utf16Index; utf16_t codeUnit1; utf16_t codeUnit2; uint32 codePoint1; uint32 codePoint2; UnicodePinIndices(str1, &str1Start, &str1Length); UnicodePinIndices(str2, &str2Start, &str2Length); /* * TODO: Allocating substrings is a performance hit. We should do * this search in-place. (However, searching UTF-8 requires tender loving * care, and it's just easier to search UTF-16.) */ substr1 = Unicode_Substr(str1, str1Start, str1Length); if (!substr1) { goto out; } substr2 = Unicode_Substr(str2, str2Start, str2Length); if (!substr2) { goto out; } /* * XXX TODO: Need to normalize the incoming strings to NFC or NFD. */ substr1UTF16 = Unicode_GetAllocUTF16(substr1); if (!substr1UTF16) { goto out; } substr2UTF16 = Unicode_GetAllocUTF16(substr2); if (!substr2UTF16) { goto out; } /* * TODO: This is the naive string search algorithm, which is * O(n * m). We can do better with KMP or Boyer-Moore if this * proves to be a bottleneck. */ while (TRUE) { codeUnit1 = *(substr1UTF16 + i); codeUnit2 = *(substr2UTF16 + i); /* * TODO: Simple case folding doesn't handle the situation where * more than one code unit is needed to store the result of the * case folding. * * This means that German "straBe" (where B = sharp S, U+00DF) * will not match "STRASSE", even though the two strings are the * same. */ if (ignoreCase) { codeUnit1 = UnicodeSimpleCaseFold(codeUnit1); codeUnit2 = UnicodeSimpleCaseFold(codeUnit2); } if (codeUnit1 != codeUnit2) { break; } if (codeUnit1 == 0) { // End of both strings reached: strings are equal. result = 0; goto out; } i++; } /* * The two UTF-16 code units differ. 
If they're the first code unit * of a surrogate pair (for Unicode values past U+FFFF), decode the * surrogate pair into a full Unicode code point. */ if (U16_IS_SURROGATE(codeUnit1)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr1UTF16); // U16_NEXT modifies the index, so let it work on a copy. utf16Index = i; // Decode the surrogate if needed. U16_NEXT(substr1UTF16, utf16Index, substrUTF16Len, codePoint1); } else { // Not a surrogate? Then the code point value is the code unit. codePoint1 = codeUnit1; } if (U16_IS_SURROGATE(codeUnit2)) { ssize_t substrUTF16Len = Unicode_UTF16Strlen(substr2UTF16); utf16Index = i; U16_NEXT(substr2UTF16, utf16Index, substrUTF16Len, codePoint2); } else { codePoint2 = codeUnit2; } if (codePoint1 < codePoint2) { result = -1; } else if (codePoint1 > codePoint2) { result = 1; } else { // If we hit the end of the string, we've already gone to 'out'. NOT_REACHED(); } out: free(substr1UTF16); free(substr2UTF16); Unicode_Free(substr1); Unicode_Free(substr2); return result; }