/**
 * Folds a zero terminated UTF-8 string to upper case, in place.
 *
 * The string is walked one code point at a time and rewritten through a
 * second cursor into the same buffer.  A folded code point is only stored
 * when it is identical to the source code point or has the same UTF-8
 * encoding length; otherwise the source code point is written back
 * unchanged.  The output therefore never outgrows the input.
 *
 * @returns psz (the same pointer that was passed in).
 * @param   psz     The string to fold; modified in place.
 */
RTDECL(char *) RTStrToUpper(char *psz)
{
    const char *pszRead  = psz;     /* decode cursor */
    char       *pszWrite = psz;     /* encode cursor, never ahead of pszRead */
    RTUNICP     ucCur;

    for (;;)
    {
        int const rcDecode = RTStrGetCpEx(&pszRead, &ucCur);
        if (RT_FAILURE(rcDecode))
        {
            /* Bad encoding: quietly copy the byte just behind the read cursor.
               (ucCur == RTUNICP_INVALID here, which is != 0, so we keep going.) */
            AssertRC(rcDecode);
            *pszWrite++ = pszRead[-1];
        }
        else
        {
            RTUNICP const ucFolded   = RTUniCpToUpper(ucCur);
            bool const    fSameWidth = ucFolded == ucCur
                                    || RTUniCpCalcUtf8Len(ucFolded) == RTUniCpCalcUtf8Len(ucCur);
            if (RT_LIKELY(fSameWidth))
                pszWrite = RTStrPutCp(pszWrite, ucFolded);
            else
                pszWrite = RTStrPutCp(pszWrite, ucCur);     /* keep the original, width differs */
        }
        Assert((uintptr_t)pszWrite <= (uintptr_t)pszRead);

        /* The terminator has been decoded and written back; we're done. */
        if (ucCur == 0)
            break;
    }

    return psz;
}
/**
 * Appends one Unicode code point, UTF-8 encoded, to the string.
 *
 * Code points below 0x80 are forwarded to the single character append().
 * Larger code points are encoded straight into the string buffer after
 * making sure there is room for the longest possible encoding (6 bytes)
 * plus the terminator.
 *
 * @returns Reference to this string.  If @a uc is above 0x7fffffff the
 *          string is returned unmodified (after asserting).
 * @param   uc      The code point to append.
 */
RTCString &RTCString::appendCodePoint(RTUNICP uc)
{
    /* Plain 7-bit character: the char overload does all the work. */
    if (uc < 0x80)
        return RTCString::append(static_cast<char>(uc));

    /* Anything beyond 31 bits isn't encodable here. */
    AssertReturn(uc <= UINT32_C(0x7fffffff), *this);

    /*
     * Grow the buffer if a worst-case (6 byte) encoding plus terminator
     * would not fit.  Always assuming the maximum length keeps this simple.
     */
    if (m_cbAllocated <= m_cch + 6)
    {
        /* reserve() reallocates and updates m_cbAllocated; may throw bad_alloc. */
        reserve(RT_ALIGN_Z(m_cch + 6 + 1, IPRT_MINISTRING_APPEND_ALIGNMENT));
#ifndef RT_EXCEPTIONS_ENABLED
        AssertRelease(capacity() > m_cch + 6);
#endif
    }

    /* Encode at the current end, re-terminate and update the length. */
    char * const pszEnd = RTStrPutCp(&m_psz[m_cch], uc);
    *pszEnd = '\0';
    m_cch   = pszEnd - m_psz;

    return *this;
}
static void test2(RTTEST hTest) { RTTestSub(hTest, "UTF-8 upper/lower encoding assumption"); #define CHECK_EQUAL(str1, str2) \ do \ { \ RTTESTI_CHECK(strlen((str1).c_str()) == (str1).length()); \ RTTESTI_CHECK((str1).length() == (str2).length()); \ RTTESTI_CHECK(mymemcmp((str1).c_str(), (str2).c_str(), (str2).length() + 1) == 0); \ } while (0) RTCString strTmp, strExpect; char szDst[16]; /* Some simple ascii stuff. */ strTmp = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strExpect = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strTmp.toUpper(); CHECK_EQUAL(strTmp, strExpect); strTmp.toLower(); strExpect = "abcdefghijklmnopqrstuvwxyz0123456abcdefghijklmnopqrstuvwxyz;-+/\\"; CHECK_EQUAL(strTmp, strExpect); strTmp = "abcdefghijklmnopqrstuvwxyz0123456ABCDEFGHIJKLMNOPQRSTUVWXYZ;-+/\\"; strTmp.toLower(); CHECK_EQUAL(strTmp, strExpect); /* Collect all upper and lower case code points. */ RTCString strLower(""); strLower.reserve(_4M); RTCString strUpper(""); strUpper.reserve(_4M); for (RTUNICP uc = 1; uc <= 0x10fffd; uc++) { /* Unicode 4.01, I think, introduced a few codepoints with lower/upper mappings that aren't up for roundtrips and which case folding has a different UTF-8 length. We'll just skip them here as there are very few: - Dotless small i and dotless capital I folds into ASCII I and i. - The small letter long s folds to ASCII S. - Greek prosgegrammeni folds to iota, which is a letter with both upper and lower case foldings of its own. 
*/ if (uc == 0x131 || uc == 0x130 || uc == 0x17f || 0x1fbe) continue; if (RTUniCpIsLower(uc)) { RTTESTI_CHECK_MSG(uc < 0xd800 || (uc > 0xdfff && uc != 0xfffe && uc != 0xffff), ("%#x\n", uc)); strLower.appendCodePoint(uc); } if (RTUniCpIsUpper(uc)) { RTTESTI_CHECK_MSG(uc < 0xd800 || (uc > 0xdfff && uc != 0xfffe && uc != 0xffff), ("%#x\n", uc)); strUpper.appendCodePoint(uc); } } RTTESTI_CHECK(strlen(strLower.c_str()) == strLower.length()); RTTESTI_CHECK(strlen(strUpper.c_str()) == strUpper.length()); /* Fold each code point in the lower case string and check that it encodes into the same or less number of bytes. */ size_t cch = 0; const char *pszCur = strLower.c_str(); RTCString strUpper2(""); strUpper2.reserve(strLower.length() + 64); for (;;) { RTUNICP ucLower; const char * const pszPrev = pszCur; RTTESTI_CHECK_RC_BREAK(RTStrGetCpEx(&pszCur, &ucLower), VINF_SUCCESS); size_t const cchSrc = pszCur - pszPrev; if (!ucLower) break; RTUNICP const ucUpper = RTUniCpToUpper(ucLower); const char *pszDstEnd = RTStrPutCp(szDst, ucUpper); size_t const cchDst = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchSrc >= cchDst, ("ucLower=%#x %u bytes; ucUpper=%#x %u bytes\n", ucLower, cchSrc, ucUpper, cchDst)); cch += cchDst; strUpper2.appendCodePoint(ucUpper); /* roundtrip stability */ RTUNICP const ucUpper2 = RTUniCpToUpper(ucUpper); RTTESTI_CHECK_MSG(ucUpper2 == ucUpper, ("ucUpper2=%#x ucUpper=%#x\n", ucUpper2, ucUpper)); RTUNICP const ucLower2 = RTUniCpToLower(ucUpper); RTTESTI_CHECK_MSG(ucLower2 == ucLower, ("ucLower2=%#x ucLower=%#x\n", ucLower2, ucLower)); RTUNICP const ucUpper3 = RTUniCpToUpper(ucLower2); RTTESTI_CHECK_MSG(ucUpper3 == ucUpper, ("ucUpper3=%#x ucUpper=%#x\n", ucUpper3, ucUpper)); pszDstEnd = RTStrPutCp(szDst, ucLower2); size_t const cchLower2 = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchDst == cchLower2, ("ucLower2=%#x %u bytes; ucUpper=%#x %u bytes; ucLower=%#x\n", ucLower2, cchLower2, ucUpper, cchDst, ucLower)); } RTTESTI_CHECK(strlen(strUpper2.c_str()) == 
strUpper2.length()); RTTESTI_CHECK_MSG(cch == strUpper2.length(), ("cch=%u length()=%u\n", cch, strUpper2.length())); /* the toUpper method shall do the same thing. */ strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); /* Ditto for the upper case string. */ cch = 0; pszCur = strUpper.c_str(); RTCString strLower2(""); strLower2.reserve(strUpper.length() + 64); for (;;) { RTUNICP ucUpper; const char * const pszPrev = pszCur; RTTESTI_CHECK_RC_BREAK(RTStrGetCpEx(&pszCur, &ucUpper), VINF_SUCCESS); size_t const cchSrc = pszCur - pszPrev; if (!ucUpper) break; RTUNICP const ucLower = RTUniCpToLower(ucUpper); const char *pszDstEnd = RTStrPutCp(szDst, ucLower); size_t const cchDst = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchSrc >= cchDst, ("ucUpper=%#x %u bytes; ucLower=%#x %u bytes\n", ucUpper, cchSrc, ucLower, cchDst)); cch += cchDst; strLower2.appendCodePoint(ucLower); /* roundtrip stability */ RTUNICP const ucLower2 = RTUniCpToLower(ucLower); RTTESTI_CHECK_MSG(ucLower2 == ucLower, ("ucLower2=%#x ucLower=%#x\n", ucLower2, ucLower)); RTUNICP const ucUpper2 = RTUniCpToUpper(ucLower); RTTESTI_CHECK_MSG(ucUpper2 == ucUpper, ("ucUpper2=%#x ucUpper=%#x\n", ucUpper2, ucUpper)); RTUNICP const ucLower3 = RTUniCpToLower(ucUpper2); RTTESTI_CHECK_MSG(ucLower3 == ucLower, ("ucLower3=%#x ucLower=%#x\n", ucLower3, ucLower)); pszDstEnd = RTStrPutCp(szDst, ucUpper2); size_t const cchUpper2 = pszDstEnd - &szDst[0]; RTTESTI_CHECK_MSG(cchDst == cchUpper2, ("ucUpper2=%#x %u bytes; ucLower=%#x %u bytes\n", ucUpper2, cchUpper2, ucLower, cchDst)); } RTTESTI_CHECK(strlen(strLower2.c_str()) == strLower2.length()); RTTESTI_CHECK_MSG(cch == strLower2.length(), ("cch=%u length()=%u\n", cch, strLower2.length())); strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); /* Checks of folding stability when nothing shall change. 
*/ strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper); strTmp = strUpper2; CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower); strTmp = strLower2; CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); /* Check folding stability for roundtrips. */ strTmp = strUpper; CHECK_EQUAL(strTmp, strUpper); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toUpper(); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp.toUpper(); strTmp.toLower(); CHECK_EQUAL(strTmp, strLower2); strTmp = strLower; CHECK_EQUAL(strTmp, strLower); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toLower(); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); strTmp.toLower(); strTmp.toUpper(); CHECK_EQUAL(strTmp, strUpper2); }
/**
 * Splits a command line string into a NULL terminated argument vector.
 *
 * Arguments are delimited by any character in @a pszSeparators.  Single and
 * double quotes group text (separators included) into one argument; the
 * quote characters themselves are dropped and a quote is closed only by the
 * same character that opened it.
 *
 * All argument strings are stored back to back in a single heap buffer, so
 * the first vector entry (when present) points at the start of that buffer.
 *
 * @returns VINF_SUCCESS on success, VERR_INVALID_PARAMETER if the separator
 *          set is empty, VERR_NO_STR_MEMORY / VERR_NO_MEMORY on allocation
 *          failure, or the failure status of the string decoding helpers.
 * @param   ppapszArgv      Where to return the argument vector.
 * @param   pcArgs          Where to return the number of arguments.
 * @param   pszCmdLine      The command line to split.
 * @param   pszSeparators   Separator characters; NULL selects " \t\n\r".
 */
RTDECL(int) RTGetOptArgvFromString(char ***ppapszArgv, int *pcArgs, const char *pszCmdLine, const char *pszSeparators)
{
    /*
     * Validate the input and resolve the default separator set.
     */
    AssertPtr(pszCmdLine);
    AssertPtr(pcArgs);
    AssertPtr(ppapszArgv);
    if (pszSeparators)
    {
        AssertPtr(pszSeparators);
    }
    else
    {
        pszSeparators = " \t\n\r";
    }
    size_t const cchSeps = strlen(pszSeparators);
    AssertReturn(cchSeps > 0, VERR_INVALID_PARAMETER);

    /*
     * One buffer holds all the argument strings.  With quotes dropped and a
     * terminator replacing each separator, the output cannot be longer than
     * the input.
     */
    const char   *pszCur = pszCmdLine;
    char * const  pszBuf = (char *)RTMemAlloc(strlen(pszCur) + 1);
    if (!pszBuf)
        return VERR_NO_STR_MEMORY;

    char     *pszOut   = pszBuf;
    char    **papszVec = NULL;
    unsigned  cArgs    = 0;
    int       rc       = VINF_SUCCESS;

    while (*pszCur)
    {
        /* Eat leading separators; stop on error or when nothing is left. */
        rc = rtGetOptSkipDelimiters(&pszCur, pszSeparators, cchSeps);
        if (RT_FAILURE(rc) || !*pszCur)
            break;

        /* Grow the vector in chunks of 32, always leaving a spare slot for
           the terminating NULL entry. */
        if ((cArgs % 32) == 0)
        {
            char **papszGrown = (char **)RTMemRealloc(papszVec, (cArgs + 33) * sizeof(char *));
            if (!papszGrown)
            {
                rc = VERR_NO_MEMORY;
                break;
            }
            papszVec = papszGrown;
        }
        papszVec[cArgs] = pszOut;
        cArgs++;

        /* Copy the argument over, code point by code point. */
        RTUNICP CpOpenQuote = 0;    /* the quote char we're inside of, 0 if none */
        RTUNICP Cp;
        for (;;)
        {
            rc = RTStrGetCpEx(&pszCur, &Cp);
            if (RT_FAILURE(rc) || !Cp)
                break;

            if (CpOpenQuote)
            {
                /* Inside quotes: only the matching quote char is special. */
                if (Cp == CpOpenQuote)
                    CpOpenQuote = 0;
                else
                    pszOut = RTStrPutCp(pszOut, Cp);
                continue;
            }

            if (Cp == '"' || Cp == '\'')
            {
                CpOpenQuote = Cp;
                continue;
            }

            /* An unquoted separator ends the argument. */
            if (rtGetOptIsCpInSet(Cp, pszSeparators, cchSeps))
                break;

            pszOut = RTStrPutCp(pszOut, Cp);
        }
        *pszOut++ = '\0';

        /* Decoding error or end of input: nothing more to parse. */
        if (RT_FAILURE(rc) || !Cp)
            break;
    }

    if (RT_FAILURE(rc))
    {
        RTMemFree(pszBuf);
        RTMemFree(papszVec);
        return rc;
    }

    /*
     * Terminate the vector.  With no arguments the string buffer is unused
     * and no vector was allocated yet, so swap it for a one-entry vector.
     */
    if (cArgs == 0)
    {
        RTMemFree(pszBuf);
        papszVec = (char **)RTMemAlloc(1 * sizeof(char *));
        if (!papszVec)
            return VERR_NO_MEMORY;
    }
    papszVec[cArgs] = NULL;

    *pcArgs     = cArgs;
    *ppapszArgv = papszVec;
    return VINF_SUCCESS;
}
/**
 * Splits a command line string into a NULL terminated argument vector,
 * using either Bourne shell or Microsoft CRT quoting rules.
 *
 * Bourne shell style: single and double quotes group text and are dropped;
 * a backslash makes the following code point literal, except inside single
 * quotes where the backslash is an ordinary character.
 *
 * Microsoft CRT style: only double quotes group text.  A run of backslashes
 * is special only when followed by a double quote: half of them are output,
 * and an odd count makes the quote literal while an even count lets it
 * toggle the quoting state.
 *
 * All argument strings are stored back to back in a single heap buffer, so
 * the first vector entry (when present) points at the start of that buffer.
 *
 * @returns VINF_SUCCESS on success, VERR_INVALID_FLAGS for an unsupported
 *          @a fFlags value, VERR_INVALID_PARAMETER if the separator set is
 *          empty, VERR_NO_STR_MEMORY / VERR_NO_MEMORY on allocation failure,
 *          or the failure status of the string decoding helpers.
 * @param   ppapszArgv      Where to return the argument vector.
 * @param   pcArgs          Where to return the number of arguments.
 * @param   pszCmdLine      The command line to split.
 * @param   fFlags          RTGETOPTARGV_CNV_QUOTE_BOURNE_SH or
 *                          RTGETOPTARGV_CNV_QUOTE_MS_CRT.
 * @param   pszSeparators   Separator characters; NULL selects " \t\n\r".
 */
RTDECL(int) RTGetOptArgvFromString(char ***ppapszArgv, int *pcArgs, const char *pszCmdLine,
                                   uint32_t fFlags, const char *pszSeparators)
{
    /*
     * Validate the input and resolve the default separator set.
     */
    AssertPtr(pszCmdLine);
    AssertPtr(pcArgs);
    AssertPtr(ppapszArgv);
    AssertReturn(   fFlags == RTGETOPTARGV_CNV_QUOTE_BOURNE_SH
                 || fFlags == RTGETOPTARGV_CNV_QUOTE_MS_CRT, VERR_INVALID_FLAGS);
    if (pszSeparators)
    {
        AssertPtr(pszSeparators);
    }
    else
    {
        pszSeparators = " \t\n\r";
    }
    size_t const cchSeps = strlen(pszSeparators);
    AssertReturn(cchSeps > 0, VERR_INVALID_PARAMETER);

    /*
     * One buffer holds all the argument strings; the parsing below only
     * drops or copies input, so the output cannot be longer than the input.
     */
    const char   *pszCur = pszCmdLine;
    char * const  pszBuf = (char *)RTMemAlloc(strlen(pszCur) + 1);
    if (!pszBuf)
        return VERR_NO_STR_MEMORY;

    char     *pszOut   = pszBuf;
    char    **papszVec = NULL;
    unsigned  cArgs    = 0;
    int       rc       = VINF_SUCCESS;

    while (*pszCur)
    {
        /* Eat leading separators; stop on error or when nothing is left. */
        rc = rtGetOptSkipDelimiters(&pszCur, pszSeparators, cchSeps);
        if (RT_FAILURE(rc) || !*pszCur)
            break;

        /* Grow the vector in chunks of 32, always leaving a spare slot for
           the terminating NULL entry. */
        if ((cArgs % 32) == 0)
        {
            char **papszGrown = (char **)RTMemRealloc(papszVec, (cArgs + 33) * sizeof(char *));
            if (!papszGrown)
            {
                rc = VERR_NO_MEMORY;
                break;
            }
            papszVec = papszGrown;
        }
        papszVec[cArgs] = pszOut;
        cArgs++;

        /*
         * Copy the argument over according to the selected quoting style.
         */
        RTUNICP Cp;
        if ((fFlags & RTGETOPTARGV_CNV_QUOTE_MASK) == RTGETOPTARGV_CNV_QUOTE_BOURNE_SH)
        {
            /*
             * Bourne shell style.
             */
            RTUNICP CpOpenQuote = 0;    /* the quote char we're inside of, 0 if none */
            for (;;)
            {
                rc = RTStrGetCpEx(&pszCur, &Cp);
                if (RT_FAILURE(rc) || !Cp)
                    break;

                if (!CpOpenQuote)
                {
                    if (Cp == '"' || Cp == '\'')
                    {
                        CpOpenQuote = Cp;
                        continue;
                    }
                    /* An unquoted separator ends the argument. */
                    if (rtGetOptIsCpInSet(Cp, pszSeparators, cchSeps))
                        break;
                }
                else if (Cp == CpOpenQuote)
                {
                    CpOpenQuote = 0;
                    continue;
                }

                /* A backslash escapes the next code point, both unquoted and
                   inside double quotes, but not inside single quotes. */
                bool const fEscape = Cp == '\\' && CpOpenQuote != '\'';
                if (fEscape)
                {
                    rc = RTStrGetCpEx(&pszCur, &Cp);
                    if (RT_FAILURE(rc) || !Cp)
                        break;
                }
                pszOut = RTStrPutCp(pszOut, Cp);
            }
        }
        else
        {
            /*
             * Microsoft CRT style.
             */
            Assert((fFlags & RTGETOPTARGV_CNV_QUOTE_MASK) == RTGETOPTARGV_CNV_QUOTE_MS_CRT);
            bool fQuoted = false;
            for (;;)
            {
                rc = RTStrGetCpEx(&pszCur, &Cp);
                if (RT_FAILURE(rc) || !Cp)
                    break;

                if (Cp == '"')
                {
                    fQuoted = !fQuoted;
                    continue;
                }

                /* An unquoted separator ends the argument. */
                if (!fQuoted && rtGetOptIsCpInSet(Cp, pszSeparators, cchSeps))
                    break;

                if (Cp != '\\')
                {
                    pszOut = RTStrPutCp(pszOut, Cp);
                    continue;
                }

                /* Count the whole backslash run (the first one is already
                   consumed).  It only matters if a double quote follows. */
                size_t cSlashes = 1;
                for (; *pszCur == '\\'; pszCur++)
                    cSlashes++;

                if (*pszCur != '"')
                {
                    /* Not an escape sequence, output the run verbatim. */
                    for (size_t i = 0; i < cSlashes; i++)
                        pszOut = RTStrPutCp(pszOut, '\\');
                    continue;
                }

                /* Escape sequence: output half of the slashes.  An odd count
                   makes the quote literal, an even count lets it toggle the
                   quoting state.  Either way the quote is consumed. */
                for (; cSlashes >= 2; cSlashes -= 2)
                    pszOut = RTStrPutCp(pszOut, '\\');
                if (cSlashes)
                    pszOut = RTStrPutCp(pszOut, '"');
                else
                    fQuoted = !fQuoted;
                pszCur++;
            }
        }

        *pszOut++ = '\0';

        /* Decoding error or end of input: nothing more to parse. */
        if (RT_FAILURE(rc) || !Cp)
            break;
    }

    if (RT_FAILURE(rc))
    {
        RTMemFree(pszBuf);
        RTMemFree(papszVec);
        return rc;
    }

    /*
     * Terminate the vector.  With no arguments the string buffer is unused
     * and no vector was allocated yet, so swap it for a one-entry vector.
     */
    if (cArgs == 0)
    {
        RTMemFree(pszBuf);
        papszVec = (char **)RTMemAlloc(1 * sizeof(char *));
        if (!papszVec)
            return VERR_NO_MEMORY;
    }
    papszVec[cArgs] = NULL;

    *pcArgs     = cArgs;
    *ppapszArgv = papszVec;
    return VINF_SUCCESS;
}