RTDECL(PRTUTF16) RTUtf16ToLower(PRTUTF16 pwsz) { PRTUTF16 pwc = pwsz; for (;;) { RTUTF16 wc = *pwc; if (!wc) break; if (wc < 0xd800 || wc >= 0xdc00) { RTUNICP ucFolded = RTUniCpToLower(wc); if (ucFolded < 0x10000) *pwc++ = RTUniCpToLower(wc); } else { /* surrogate */ RTUTF16 wc2 = pwc[1]; if (wc2 >= 0xdc00 && wc2 <= 0xdfff) { RTUNICP uc = 0x10000 + (((wc & 0x3ff) << 10) | (wc2 & 0x3ff)); RTUNICP ucFolded = RTUniCpToLower(uc); if (uc != ucFolded && ucFolded >= 0x10000) /* we don't support shrinking the string */ { uc -= 0x10000; *pwc++ = 0xd800 | (uc >> 10); *pwc++ = 0xdc00 | (uc & 0x3ff); } } else /* invalid encoding. */
/**
 * Performs a case insensitive string compare between two UTF-8 strings, given a
 * maximum string length.
 *
 * This is a simplified compare, as only the simplified lower/upper case folding
 * specified by the unicode specs are used.  It does not consider character pairs
 * as they are used in some languages, just simple upper & lower case compares.
 *
 * The result is the difference between the mismatching codepoints after they
 * both have been lower cased.
 *
 * If the string encoding is invalid the function will assert (strict builds)
 * and use RTStrNCmp (case sensitive) for the remainder of the string.
 *
 * @returns < 0 if the first string less than the second string.
 * @returns 0 if the first string identical to the second string.
 * @returns > 0 if the first string greater than the second string.
 * @param   psz1        First UTF-8 string. Null is allowed.
 * @param   psz2        Second UTF-8 string. Null is allowed.
 * @param   cchMax      Maximum string length, counted in chars (bytes) of
 *                      the first string.
 */
RTDECL(int) RTStrNICmp(const char *psz1, const char *psz2, size_t cchMax)
{
    if (cchMax == 0)
        return 0;
    if (psz1 == psz2)
        return 0;
    if (!psz1)
        return -1;
    if (!psz2)
        return 1;

    for (;;)
    {
        /*
         * Remember where this round started.  If either string turns out to
         * be badly encoded we rewind both strings and the remaining length to
         * this point, no matter how many bytes the decoder consumed, so the
         * case sensitive fallback compares properly aligned remainders.
         */
        const char * const pszRound1 = psz1;
        const char * const pszRound2 = psz2;
        size_t const       cchRound  = cchMax;

        /* Get the codepoints */
        RTUNICP uc1;
        int rc = RTStrGetCpNEx(&psz1, &cchMax, &uc1);
        if (RT_FAILURE(rc))
        {
            AssertRC(rc);
            psz1   = pszRound1;
            cchMax = cchRound;
            break;
        }

        RTUNICP uc2;
        size_t  cchMax2 = cchRound;
        rc = RTStrGetCpNEx(&psz2, &cchMax2, &uc2);
        if (RT_FAILURE(rc))
        {
            AssertRC(rc);
            psz1   = pszRound1;     /* undo the successful read of the first string too */
            psz2   = pszRound2;
            cchMax = cchRound;
            break;
        }

        /* compare */
        int iDiff = uc1 - uc2;
        if (iDiff)
        {
            iDiff = RTUniCpToUpper(uc1) != RTUniCpToUpper(uc2);
            if (iDiff)
            {
                iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* lower case diff last! */
                if (iDiff)
                    return iDiff;
            }
        }

        /* hit the terminator or exhausted the length limit? */
        if (!uc1 || cchMax == 0)
            return 0;
    }

    /* Hit some bad encoding, continue in case sensitive mode. */
    return RTStrNCmp(psz1, psz2, cchMax);
}
/**
 * Case insensitive comparison of two zero terminated UTF-8 strings.
 *
 * Only the simple one-to-one upper/lower case foldings from the unicode
 * specs are applied; multi character foldings used by some languages are not
 * taken into account.
 *
 * When a mismatch is found, the value returned is the difference between the
 * two lower cased code points.
 *
 * Badly encoded input triggers an assertion in strict builds, after which the
 * rest of the strings is compared with RTStrCmp.
 *
 * @returns < 0 if the first string less than the second string.
 * @returns 0 if the first string identical to the second string.
 * @returns > 0 if the first string greater than the second string.
 * @param   psz1        First UTF-8 string. Null is allowed.
 * @param   psz2        Second UTF-8 string. Null is allowed.
 */
RTDECL(int) RTStrICmp(const char *psz1, const char *psz2)
{
    /* Identical pointers (which includes two NULLs) are equal; a lone NULL sorts first. */
    if (psz1 == psz2)
        return 0;
    if (!psz1)
        return -1;
    if (!psz2)
        return 1;

    const char *pszBegin1 = psz1;
    for (;;)
    {
        /* Decode the next code point of the first string. */
        RTUNICP ucLeft;
        int rc = RTStrGetCpEx(&psz1, &ucLeft);
        if (RT_FAILURE(rc))
        {
            AssertRC(rc);
            --psz1;
            break;
        }

        /* Decode the next code point of the second string. */
        RTUNICP ucRight;
        rc = RTStrGetCpEx(&psz2, &ucRight);
        if (RT_FAILURE(rc))
        {
            AssertRC(rc);
            --psz2;
            /* The first string was already advanced, step it back one code point. */
            psz1 = RTStrPrevCp(pszBegin1, psz1);
            break;
        }

        /* Different code points are only a mismatch if both foldings differ. */
        int iDelta = ucLeft - ucRight;
        if (   iDelta != 0
            && RTUniCpToUpper(ucLeft) != RTUniCpToUpper(ucRight))
        {
            iDelta = RTUniCpToLower(ucLeft) - RTUniCpToLower(ucRight); /* lower case diff last! */
            if (iDelta != 0)
                return iDelta;
        }

        /* Both strings ended here without a mismatch. */
        if (ucLeft == 0)
            return 0;
    }

    /* Hit some bad encoding, continue in case sensitive mode. */
    return RTStrCmp(psz1, psz2);
}
/**
 * Folds a zero terminated UTF-8 string to lower case, in place.
 *
 * @returns psz, i.e. the pointer that was passed in.
 * @param   psz     The string to convert; it is rewritten in place.
 */
RTDECL(char *) RTStrToLower(char *psz)
{
    /*
     * Walk the string one code point at a time, writing the folded code point
     * back over the input.
     *
     * ASSUMES that a folded code point never needs more bytes than the
     * original one (this is presently correct), so the write position can
     * never overtake the read position.
     */
    const char *pszRead  = psz;
    char       *pszWrite = psz;
    for (;;)
    {
        RTUNICP ucCur;
        int const rc = RTStrGetCpEx(&pszRead, &ucCur);
        if (!RT_SUCCESS(rc))
        {
            /* bad encoding, just copy the byte quietly (ucCur == RTUNICP_INVALID (!= 0)). */
            AssertRC(rc);
            *pszWrite++ = pszRead[-1];
        }
        else
        {
            ucCur = RTUniCpToLower(ucCur);
            pszWrite = RTStrPutCp(pszWrite, ucCur);
        }
        Assert((uintptr_t)pszWrite <= (uintptr_t)pszRead);

        /* The terminator has been written as well at this point, so we are done. */
        if (ucCur == 0)
            break;
    }
    return psz;
}
/**
 * Intended to detect the few unicode code points whose case folding is
 * unstable for UTF-8.
 *
 * Unicode 4.01, I think, introduced a handful of code points whose lower and
 * upper case mappings have different lengths when encoded as UTF-8.  That
 * breaks some assumptions we used to make, so the idea is to detect and
 * ignore them here.  The actual case folding functions in IPRT will of course
 * deal with this in a more robust manner.
 *
 * @note    The detection is currently switched off: the comparison of the
 *          UTF-8 encoding lengths (RTUniCpCalcUtf8Len) of the two foldings is
 *          disabled and the function reports every code point as
 *          unproblematic.
 *
 * @returns true if problematic, false if not (at present always false).
 * @param   uc      The code point.
 */
static bool isUnevenUtf8FoldingCp(RTUNICP uc)
{
    /* The foldings are still looked up, but nothing is done with the results
       while the length comparison is disabled. */
    (void)RTUniCpToLower(uc);
    (void)RTUniCpToUpper(uc);
    return false;
}
/**
 * Case insensitive search for a UTF-8 substring.
 *
 * The first code point of the needle is matched against the haystack code
 * point by code point, accepting the code point itself as well as its simple
 * upper and lower case foldings.  At every candidate position the rest of the
 * needle is compared with RTStrNICmp.
 *
 * @returns Pointer to the first byte of the first match in the haystack,
 *          pszHaystack itself if the needle is an empty string, or NULL if
 *          there is no match or either argument is NULL.
 * @param   pszHaystack     The string to search. NULL is allowed.
 * @param   pszNeedle       The string to search for. NULL is allowed.
 */
RTDECL(char *) RTStrIStr(const char *pszHaystack, const char *pszNeedle)
{
    /* Any NULL strings means NULL return. (In the RTStrCmp tradition.) */
    if (!pszHaystack)
        return NULL;
    if (!pszNeedle)
        return NULL;

    /* The empty string matches everything. */
    if (!*pszNeedle)
        return (char *)pszHaystack;

    /*
     * Pick out the first code point of the needle and work out its foldings.
     * Afterwards pszNeedle points at the remainder of the needle (if any) and
     * cchNeedle is the length of that remainder.
     */
    RTUNICP Cp0;
    RTStrGetCpEx(&pszNeedle, &Cp0);     /* pszNeedle is advanced one code point. */
    size_t const  cchNeedle = strlen(pszNeedle);
    RTUNICP const Cp0Lower  = RTUniCpToLower(Cp0);
    RTUNICP const Cp0Upper  = RTUniCpToUpper(Cp0);

    for (;;)
    {
        /*
         * Remember where this haystack code point starts: that is what we
         * return on a match.  It must not be derived from the encoded length
         * of Cp0, because the matching haystack code point may be another
         * case variant with a different UTF-8 length.
         */
        const char * const pszMatch = pszHaystack;

        RTUNICP Cp;
        RTStrGetCpEx(&pszHaystack, &Cp);
        if (!Cp)
            break;

        /*
         * Compare against Cp0 and both of its foldings.  Cp0 itself has to be
         * included since it is not necessarily identical to either folding.
         * For code points without case, or which equal one of their foldings,
         * the extra comparisons are merely redundant.
         */
        if (   (   Cp == Cp0
                || Cp == Cp0Upper
                || Cp == Cp0Lower)
            && !RTStrNICmp(pszHaystack, pszNeedle, cchNeedle))
            return (char *)pszMatch;
    }

    return NULL;
}
static void test2(RTTEST hTest) { RTTestSub(hTest, "UTF-8 upper/lower encoding assumption"); #define CHECK_EQUAL(str1, str2) \ do \ { \ RTTESTI_CHECK(strlen((str1).c_str()) == (str1).length()); \ RTTESTI_CHECK((str1).length() == (str2).length()); \ RTTESTI_CHECK(mymemcmp((str1).c_str(), (str2).c_str(), (str2).length() + 1) == 0); \ } while (0) RTCString strTmp, strExpect; char szDst[16]; /* Some simple ascii stuff. */ strTmp = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strExpect = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strTmp.toUpper(); CHECK_EQUAL(strTmp, strExpect); strTmp.toLower(); strExpect = "abcdefghijklmnopqrstuvwxyz0123456abcdefghijklmnopqrstuvwxyz;-+/\\"; CHECK_EQUAL(strTmp, strExpect); strTmp = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strTmp.toLower(); CHECK_EQUAL(strTmp, strExpect); /* Collect all upper and lower case code points. */ RTCString strLower(""); strLower.reserve(_4M); RTCString strUpper(""); strUpper.reserve(_4M); for (RTUNICP uc = 1; uc <= 0x10fffd; uc++) { /* Unicode 4.01, I think, introduced a few codepoints with lower/upper mappings that aren't up for roundtrips and which case folding has a different UTF-8 length. We'll just skip them here as there are very few: - Dotless small i and dotless capital I folds into ASCII I and i. - The small letter long s folds to ASCII S. - Greek prosgegrammeni folds to iota, which is a letter with both upper and lower case foldings of its own. 
*/ if (uc == 0x131 || uc == 0x130 || uc == 0x17f || 0x1fbe) continue; if (RTUniCpIsLower(uc)) { RTTESTI_CHECK_MSG(uc < 0xd800 || (uc > 0xdfff && uc != 0xfffe && uc != 0xffff), ("%#x\n", uc)); strLower.appendCodePoint(uc); } if (RTUniCpIsUpper(uc)) { RTTESTI_CHECK_MSG(uc < 0xd800 || (uc > 0xdfff && uc != 0xfffe && uc != 0xffff), ("%#x\n", uc)); strUpper.appendCodePoint(uc); } } RTTESTI_CHECK(strlen(strLower.c_str()) == strLower.length()); RTTESTI_CHECK(strlen(strUpper.c_str()) == strUpper.length()); /* Fold each code point in the lower case string and check that it encodes into the same or less number of bytes. */ size_t cch = 0; const char *pszCur = strLower.c_str(); RTCString strUpper2(""); strUpper2.reserve(strLower.length() + 64); for (;;) { RTUNICP ucLower; const char * const pszPrev = pszCur; RTTESTI_CHECK_RC_BREAK(RTStrGetCpEx(&pszCur, &ucLower), VINF_SUCCESS); size_t const cchSrc = pszCur - pszPrev; if (!ucLower) break; RTUNICP const ucUpper = RTUniCpToUpper(ucLower); const char *pszDstEnd = RTStrPutCp(szDst, ucUpper); size_t const cchDst = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchSrc >= cchDst, ("ucLower=%#x %u bytes; ucUpper=%#x %u bytes\n", ucLower, cchSrc, ucUpper, cchDst)); cch += cchDst; strUpper2.appendCodePoint(ucUpper); /* roundtrip stability */ RTUNICP const ucUpper2 = RTUniCpToUpper(ucUpper); RTTESTI_CHECK_MSG(ucUpper2 == ucUpper, ("ucUpper2=%#x ucUpper=%#x\n", ucUpper2, ucUpper)); RTUNICP const ucLower2 = RTUniCpToLower(ucUpper); RTTESTI_CHECK_MSG(ucLower2 == ucLower, ("ucLower2=%#x ucLower=%#x\n", ucLower2, ucLower)); RTUNICP const ucUpper3 = RTUniCpToUpper(ucLower2); RTTESTI_CHECK_MSG(ucUpper3 == ucUpper, ("ucUpper3=%#x ucUpper=%#x\n", ucUpper3, ucUpper)); pszDstEnd = RTStrPutCp(szDst, ucLower2); size_t const cchLower2 = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchDst == cchLower2, ("ucLower2=%#x %u bytes; ucUpper=%#x %u bytes; ucLower=%#x\n", ucLower2, cchLower2, ucUpper, cchDst, ucLower)); } RTTESTI_CHECK(strlen(strUpper2.c_str()) == 
strUpper2.length()); RTTESTI_CHECK_MSG(cch == strUpper2.length(), ("cch=%u length()=%u\n", cch, strUpper2.length())); /* the toUpper method shall do the same thing. */ strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); /* Ditto for the upper case string. */ cch = 0; pszCur = strUpper.c_str(); RTCString strLower2(""); strLower2.reserve(strUpper.length() + 64); for (;;) { RTUNICP ucUpper; const char * const pszPrev = pszCur; RTTESTI_CHECK_RC_BREAK(RTStrGetCpEx(&pszCur, &ucUpper), VINF_SUCCESS); size_t const cchSrc = pszCur - pszPrev; if (!ucUpper) break; RTUNICP const ucLower = RTUniCpToLower(ucUpper); const char *pszDstEnd = RTStrPutCp(szDst, ucLower); size_t const cchDst = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchSrc >= cchDst, ("ucUpper=%#x %u bytes; ucLower=%#x %u bytes\n", ucUpper, cchSrc, ucLower, cchDst)); cch += cchDst; strLower2.appendCodePoint(ucLower); /* roundtrip stability */ RTUNICP const ucLower2 = RTUniCpToLower(ucLower); RTTESTI_CHECK_MSG(ucLower2 == ucLower, ("ucLower2=%#x ucLower=%#x\n", ucLower2, ucLower)); RTUNICP const ucUpper2 = RTUniCpToUpper(ucLower); RTTESTI_CHECK_MSG(ucUpper2 == ucUpper, ("ucUpper2=%#x ucUpper=%#x\n", ucUpper2, ucUpper)); RTUNICP const ucLower3 = RTUniCpToLower(ucUpper2); RTTESTI_CHECK_MSG(ucLower3 == ucLower, ("ucLower3=%#x ucLower=%#x\n", ucLower3, ucLower)); pszDstEnd = RTStrPutCp(szDst, ucUpper2); size_t const cchUpper2 = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchDst == cchUpper2, ("ucUpper2=%#x %u bytes; ucLower=%#x %u bytes\n", ucUpper2, cchUpper2, ucLower, cchDst)); } RTTESTI_CHECK(strlen(strLower2.c_str()) == strLower2.length()); RTTESTI_CHECK_MSG(cch == strLower2.length(), ("cch=%u length()=%u\n", cch, strLower2.length())); strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); /* Checks of folding stability when nothing shall change. 
*/ strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp = strUpper2; CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp = strLower2; CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); /* Check folding stability for roundtrips. */ strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toUpper(); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toUpper(); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toLower(); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toLower(); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); }
/**
 * Performs a case insensitive compare of two zero terminated UTF-16 strings.
 *
 * The strings are walked one 16-bit unit at a time.  Units that differ are
 * first compared after upper case folding and, if still different, after
 * lower case folding.  When both differing units are in the surrogate range
 * the complete code points are assembled from the surrogate pairs before
 * folding.
 *
 * @returns 0 if the strings are equal under this comparison (also when both
 *          pointers are the same, including both being NULL).
 * @returns -1 if only pwsz1 is NULL, 1 if only pwsz2 is NULL.
 * @returns Otherwise the non-zero difference at the first mismatch: the
 *          difference of the lower cased code points, or the difference of
 *          the raw 16-bit units when a malformed surrogate sequence is found
 *          in the first string.
 * @param   pwsz1   First UTF-16 string. NULL is allowed.
 * @param   pwsz2   Second UTF-16 string. NULL is allowed.
 */
RTDECL(int) RTUtf16ICmp(register PCRTUTF16 pwsz1, register PCRTUTF16 pwsz2)
{
    if (pwsz1 == pwsz2)
        return 0;
    if (!pwsz1)
        return -1;
    if (!pwsz2)
        return 1;

    PCRTUTF16 pwsz1Start = pwsz1; /* keep it around in case we have to backtrack on a surrogate pair */
    for (;;)
    {
        register RTUTF16  wc1 = *pwsz1;
        register RTUTF16  wc2 = *pwsz2;
        register int     iDiff = wc1 - wc2;
        if (iDiff)
        {
            /* unless they are *both* surrogates, there is no chance they'll be
               identical as full code points, so fold the single units directly. */
            if (    wc1 < 0xd800
                ||  wc2 < 0xd800
                ||  wc1 > 0xdfff
                ||  wc2 > 0xdfff)
            {
                /* simple UCS-2 char */
                iDiff = RTUniCpToUpper(wc1) - RTUniCpToUpper(wc2);
                if (iDiff)
                    iDiff = RTUniCpToLower(wc1) - RTUniCpToLower(wc2);
            }
            else
            {
                /* a damned pair: both units are surrogates, assemble the full
                   code points before folding. */
                RTUNICP     uc1;
                RTUNICP     uc2;
                if (wc1 >= 0xdc00)
                {
                    /* wc1 is a low (trailing) surrogate, so the high halves were
                       at the previous position; without a previous unit, or if
                       it is not a high surrogate, return the raw difference. */
                    if (pwsz1Start == pwsz1)
                        return iDiff;
                    uc1 = pwsz1[-1];
                    if (uc1 < 0xd800 || uc1 >= 0xdc00)
                        return iDiff;
                    uc1 = 0x10000 + (((uc1       & 0x3ff) << 10) | (wc1 & 0x3ff));
                    /* NOTE(review): pwsz2[-1] is combined without being range
                       checked; both pointers advance in lockstep, so the
                       previous units compared equal or we would not be here -
                       confirm this is the intended justification. */
                    uc2 = 0x10000 + (((pwsz2[-1] & 0x3ff) << 10) | (wc2 & 0x3ff));
                }
                else
                {
                    /* wc1 is a high (leading) surrogate: advance to the next
                       unit, which must be a low surrogate. */
                    uc1 = *++pwsz1;
                    if (uc1 < 0xdc00 || uc1 >= 0xe000)
                        return iDiff;
                    uc1 = 0x10000 + (((wc1 & 0x3ff) << 10) | (uc1      & 0x3ff));
                    /* NOTE(review): the second string is advanced as well, but
                       neither wc2 nor its following unit is checked for being
                       the matching surrogate half - verify whether malformed
                       input in pwsz2 needs handling here. */
                    uc2 = 0x10000 + (((wc2 & 0x3ff) << 10) | (*++pwsz2 & 0x3ff));
                }
                iDiff = RTUniCpToUpper(uc1) - RTUniCpToUpper(uc2);
                if (iDiff)
                    iDiff = RTUniCpToLower(uc1) - RTUniCpToLower(uc2); /* serious paranoia! */
            }
            if (iDiff)
                return iDiff;
        }

        /* Reached the terminator of the first string without a mismatch. */
        if (!wc1)
            return 0;
        pwsz1++;
        pwsz2++;
    }
}