char * Unicode_ToTitle(const char *str, // IN const char *locale) // IN { UCaseMap *caseMap; UErrorCode status = U_ZERO_ERROR; char *utf8Dest; const char *utf8Src = (const char *)str; int32_t utf8SrcLen = strlen(utf8Src); int32_t destCapacity = utf8SrcLen + 1; int32_t destLen; char *result = NULL; // Most title-case operations don't change the length of the string. utf8Dest = (char *)Util_SafeMalloc(destCapacity); caseMap = ucasemap_open(locale, 0, &status); if (U_FAILURE(status)) { goto out; } destLen = ucasemap_utf8ToTitle(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto out; } // If we need a bigger buffer, then reallocate and retry. destCapacity = destLen + 1; utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity); status = U_ZERO_ERROR; destLen = ucasemap_utf8ToTitle(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); out: ucasemap_close(caseMap); if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) { result = utf8Dest; } else { ASSERT(U_SUCCESS(status)); ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); } return result; }
/*
 * Return the shared UCaseMap kept in icu_csm, opening it on first use with
 * a NULL locale and U_FOLD_CASE_DEFAULT. A failed open is reported through
 * i_fatal().
 */
static struct UCaseMap *fts_icu_csm(void)
{
	if (icu_csm == NULL) {
		UErrorCode err = U_ZERO_ERROR;

		icu_csm = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &err);
		if (U_FAILURE(err)) {
			i_fatal("LibICU ucasemap_open() failed: %s",
				u_errorName(err));
		}
	}
	return icu_csm;
}
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; IcuTestErrorCode errorCode(*this, "TestCasingImpl"); LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode)); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode); ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode); result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0); if(errorCode.isFailure()) { errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); errorCode.reset(); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); errln("expected \"" + output + "\" got \"" + result + "\"" ); } }
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; UCaseMap *csm; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open(localeID, options, &errorCode); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. int32_t size=1; // Not 0 because that only gives preflighting. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode); ucasemap_setBreakIterator(csm, clone, &errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode); result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0); if(U_FAILURE(errorCode)) { errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); } ucasemap_close(csm); }
/* Try titlecasing with options. */ static void TestUCaseMapToTitle(void) { /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */ /* * Note: The sentence BreakIterator does not recognize a '.' * as a sentence terminator if it is followed by lowercase. * That is why the example has the '!'. */ static const UChar beforeTitle[]= { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e }, titleWord[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e }, titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e }, titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e }; UChar buffer[32]; UCaseMap *csm; UBreakIterator *sentenceIter; const UBreakIterator *iter; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open("", 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode)); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=NULL) { log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter); } /* Use default UBreakIterator: Word breaks. */ length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWord) || 0!=u_memcmp(buffer, titleWord, length) || buffer[length]!=0 ) { log_err_status(errorCode, "ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } if (U_SUCCESS(errorCode)) { iter=ucasemap_getBreakIterator(csm); if(iter==NULL) { log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n"); } } /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. 
*/ ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode)); return; } length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWordNoAdjust) || 0!=u_memcmp(buffer, titleWordNoAdjust, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* Set a sentence break iterator. */ errorCode=U_ZERO_ERROR; sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode)); ucasemap_close(csm); return; } ucasemap_setBreakIterator(csm, sentenceIter, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode)); ubrk_close(sentenceIter); ucasemap_close(csm); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=sentenceIter) { log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter); } ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode)); return; } /* Use the sentence break iterator with the option. Preflight first. 
*/ length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=UPRV_LENGTHOF(titleSentNoLower) ) { log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; buffer[0]=0; length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleSentNoLower) || 0!=u_memcmp(buffer, titleSentNoLower, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */ { char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64]; int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength; errorCode=U_ZERO_ERROR; u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, UPRV_LENGTHOF(titleSentNoLower), &errorCode); length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode); if( U_FAILURE(errorCode) || length!=utf8TitleSentNoLowerLength || 0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) || utf8[length]!=0 ) { log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } } ucasemap_close(csm); }
/* * API test for UCaseMap; * test cases for actual case mappings using UCaseMap see * intltest utility/UnicodeStringTest/StringCaseTest/TestCasing */ static void TestUCaseMap(void) { static const char aBc[] ={ 0x61, 0x42, 0x63, 0 }, abc[] ={ 0x61, 0x62, 0x63, 0 }, ABCg[]={ 0x41, 0x42, 0x43, 0x67, 0 }, defg[]={ 0x64, 0x65, 0x66, 0x67, 0 }; char utf8Out[8]; UCaseMap *csm; const char *locale; uint32_t options; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open("tur", 0xa5, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucasemap_open(\"tur\") failed - %s\n", u_errorName(errorCode)); return; } locale=ucasemap_getLocale(csm); if(0!=strcmp(locale, "tr")) { log_err("ucasemap_getLocale(ucasemap_open(\"tur\"))==%s!=\"tr\"\n", locale); } /* overly long locale IDs get truncated to their language code to avoid unnecessary allocation */ ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode); locale=ucasemap_getLocale(csm); if(0!=strcmp(locale, "i-klingon")) { log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s!=\"i-klingon\"\n", locale); } errorCode=U_ZERO_ERROR; options=ucasemap_getOptions(csm); if(options!=0xa5) { log_err("ucasemap_getOptions(ucasemap_open(0xa5))==0x%lx!=0xa5\n", (long)options); } ucasemap_setOptions(csm, 0x333333, &errorCode); options=ucasemap_getOptions(csm); if(options!=0x333333) { log_err("ucasemap_getOptions(ucasemap_setOptions(0x333333))==0x%lx!=0x333333\n", (long)options); } /* test case mapping API; not all permutations necessary due to shared implementation code */ /* NUL terminated source */ errorCode=U_ZERO_ERROR; length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode); if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) { log_err("ucasemap_utf8ToLower(aBc\\0) failed\n"); } /* incoming failure code */ errorCode=U_PARSE_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToLower(csm, utf8Out, 
(int32_t)sizeof(utf8Out), aBc, -1, &errorCode); if(errorCode!=U_PARSE_ERROR || 0!=strcmp(defg, utf8Out)) { log_err("ucasemap_utf8ToLower(failure) failed\n"); } /* overlapping input & output */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, aBc); length=ucasemap_utf8ToUpper(csm, utf8Out, 2, utf8Out+1, 2, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) { log_err("ucasemap_utf8ToUpper(overlap 1) failed\n"); } /* overlap in the other direction */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, aBc); length=ucasemap_utf8ToUpper(csm, utf8Out+1, 2, utf8Out, 2, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) { log_err("ucasemap_utf8ToUpper(overlap 2) failed\n"); } /* NULL destination */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToLower(csm, NULL, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) { log_err("ucasemap_utf8ToLower(dest=NULL) failed\n"); } /* destCapacity<0 */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToLower(csm, utf8Out, -2, aBc, -1, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) { log_err("ucasemap_utf8ToLower(destCapacity<0) failed\n"); } /* NULL source */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), NULL, -1, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) { log_err("ucasemap_utf8ToLower(src=NULL) failed\n"); } /* srcLength<-1 */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -2, &errorCode); if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) { log_err("ucasemap_utf8ToLower(srcLength<-1) failed\n"); } /* buffer overflow */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToUpper(csm, utf8Out, 2, aBc, 3, &errorCode); if(errorCode!=U_BUFFER_OVERFLOW_ERROR 
|| length!=3 || 0!=strcmp(defg+2, utf8Out+2)) { log_err("ucasemap_utf8ToUpper(overflow) failed\n"); } /* dest not terminated (leaves g from defg alone) */ errorCode=U_ZERO_ERROR; strcpy(utf8Out, defg); length=ucasemap_utf8ToUpper(csm, utf8Out, 3, aBc, 3, &errorCode); if(errorCode!=U_STRING_NOT_TERMINATED_WARNING || length!=3 || 0!=strcmp(ABCg, utf8Out)) { log_err("ucasemap_utf8ToUpper(overflow) failed\n"); } /* C API coverage for case folding. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */ errorCode=U_ZERO_ERROR; utf8Out[0]=0; length=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, 3, &errorCode); if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) { log_err("ucasemap_utf8FoldCase(aBc) failed\n"); } ucasemap_close(csm); }
/*
 * casefoldx - store or append the case-folded form of a string
 *
 * flags: CASEF_FLAG_APPEND appends to dest instead of overwriting it;
 * CASEF_FLAG_UTF8 enables ICU case folding for non-ASCII input (it has no
 * effect when built with NO_EAI, which only lowercases).
 *
 * dest: output buffer. A null pointer selects a function-static buffer,
 * in both the NO_EAI and the Unicode build.
 *
 * src, len: input text and its length; a negative len means strlen(src).
 *
 * Returns STR(dest). An ICU error other than buffer overflow is fatal.
 */
char   *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
{
    static VSTRING *fold_buf = 0;
    size_t  old_len;

#ifndef NO_EAI
    const char myname[] = "casefold";
    static UCaseMap *csm = 0;
    UErrorCode error;
    ssize_t space_needed;
    int     n;

#endif

    /*
     * Handle special cases. This is shared by both builds so that a null
     * dest is never dereferenced.
     */
    if (len < 0)
        len = strlen(src);
    if (dest == 0)
        dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
    if ((flags & CASEF_FLAG_APPEND) == 0)
        VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);

#ifdef NO_EAI

    /*
     * ASCII mode only.
     */
    vstring_strncat(dest, src, len);
    lowercase(STR(dest) + old_len);
    return (STR(dest));
#else

    /*
     * Unicode mode.
     */

    /*
     * All-ASCII input, or ASCII mode only.
     */
    if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
        vstring_strncat(dest, src, len);
        lowercase(STR(dest) + old_len);
        return (STR(dest));
    }

    /*
     * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
     * errors. XXX Based on source-code review we conclude that non-UTF-8
     * bytes are copied verbatim, and experiments confirm this. Given that
     * this behavior is intentional, we assume that it will stay that way.
     */
#if 0
    if (valid_utf8_string(src, len) == 0) {
        if (err)
            *err = "malformed UTF-8 or invalid codepoint";
        return (0);
    }
#endif

    /*
     * One-time initialization. With ICU 4.8 this works while chrooted.
     */
    if (csm == 0) {
        error = U_ZERO_ERROR;
        csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
        if (U_SUCCESS(error) == 0)
            msg_fatal("ucasemap_open error: %s", u_errorName(error));
    }

    /*
     * Fold the input, adjusting the buffer size if needed. Safety: don't
     * loop forever.
     * 
     * Note: the requested amount of space for casemapped output (as reported
     * with space_needed below) does not include storage for the null
     * terminator. The terminator is written only when the output buffer is
     * large enough. This is why we overallocate space when the output does
     * not fit. But if the output fits exactly, then the output will be
     * unterminated, and we have to terminate the output ourselves.
     */
    for (n = 0; n < 3; n++) {
        error = U_ZERO_ERROR;
        space_needed =
            ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
                                  vstring_avail(dest), src, len, &error);
        if (U_SUCCESS(error)) {
            VSTRING_AT_OFFSET(dest, old_len + space_needed);
            if (vstring_avail(dest) == 0)       /* exact fit, no terminator */
                VSTRING_TERMINATE(dest);        /* add terminator */
            break;
        } else if (error == U_BUFFER_OVERFLOW_ERROR) {
            VSTRING_SPACE(dest, space_needed + 1);      /* for terminator */
        } else {
            msg_fatal("%s: conversion error for \"%s\": %s",
                      myname, src, u_errorName(error));
        }
    }
    return (STR(dest));
#endif                                  /* NO_EAI */
}
char * Unicode_ToLower(const char *str, // IN const char *locale) // IN { UCaseMap *caseMap; UErrorCode status = U_ZERO_ERROR; char *utf8Dest; const char *utf8Src = (const char *)str; int32_t utf8SrcLen = strlen(utf8Src); int32_t destCapacity = utf8SrcLen + 1; int32_t destLen; char *result = NULL; /* * XXX TODO: This and the two following functions are substantially * identical. Refactor them! (Note that ucasemap_utf8ToTitle * takes a non-const UCaseMap, so we can't just use pointers to * functions unless we cast.) */ // Most lower-case operations don't change the length of the string. utf8Dest = (char *)Util_SafeMalloc(destCapacity); caseMap = ucasemap_open(locale, 0, &status); if (U_FAILURE(status)) { goto out; } destLen = ucasemap_utf8ToLower(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto out; } // If we need a bigger buffer, then reallocate and retry. destCapacity = destLen + 1; utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity); status = U_ZERO_ERROR; destLen = ucasemap_utf8ToLower(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); out: ucasemap_close(caseMap); if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) { result = utf8Dest; } else { ASSERT(U_SUCCESS(status)); ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); } return result; }
/**
 *  Convert each string of a character vector to title case
 *
 *  The break iterator described by opts_brkiter (a word iterator unless
 *  specified otherwise) decides where titlecased segments start.
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* csm = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(opts_brkiter2);

   UErrorCode status = U_ZERO_ERROR;
   csm = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(csm, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?
   // now briter is owned by ucasemap.
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)

   R_len_t n_strings = LENGTH(str);
   StriContainerUTF8 str_cont(str, n_strings);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_strings));

   // Initial output buffer: longest input (in bytes) plus a small margin.
   // The mapped string may be longer or shorter than its source, so the
   // buffer is enlarged on demand inside the loop.
   R_len_t initial_size = str_cont.getMaxNumBytes() + 10;
   String8buf out(initial_size);

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
      }
      else {
         R_len_t src_len = str_cont.get(i).length();
         const char* src = str_cont.get(i).c_str();

         status = U_ZERO_ERROR;
         int out_len = ucasemap_utf8ToTitle(csm,
            out.data(), out.size(), src, src_len, &status);

         if (U_FAILURE(status)) {
            // first attempt failed: resize to the reported length and retry once
            out.resize(out_len, false/*destroy contents*/);
            status = U_ZERO_ERROR;
            out_len = ucasemap_utf8ToTitle(csm,
               out.data(), out.size(), src, src_len, &status);

            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                               // we do have the buffer size required to complete this op
         }

         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out.data(), out_len, CE_UTF8));
      }
   }

   if (csm) { ucasemap_close(csm); csm = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (csm) { ucasemap_close(csm); csm = NULL; }
   })
}
/**
 *  Convert each string of a character vector to lower or upper case
 *
 *  @param str character vector
 *  @param _type internal selector: 1 for lower case, 2 for upper case;
 *    any other value raises an R error
 *  @param locale single string identifying
 *    the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG);

   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* csm = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   UErrorCode status = U_ZERO_ERROR;
   csm = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t n_strings = LENGTH(str);
   StriContainerUTF8 str_cont(str, n_strings);

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, n_strings));

   // Initial output buffer: longest input (in bytes) plus a small margin.
   // The mapped string may be longer or shorter than its source, so the
   // buffer is enlarged on demand inside the loop.
   R_len_t initial_size = str_cont.getMaxNumBytes() + 10;
   String8buf out(initial_size);

   // _type was validated above, so anything that is not 1 is 2 (upper case)
   const bool to_lower = (_type == 1);

   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
      }
      else {
         R_len_t src_len = str_cont.get(i).length();
         const char* src = str_cont.get(i).c_str();

         status = U_ZERO_ERROR;
         int out_len = to_lower
            ? ucasemap_utf8ToLower(csm, out.data(), out.size(), src, src_len, &status)
            : ucasemap_utf8ToUpper(csm, out.data(), out.size(), src, src_len, &status);

         if (U_FAILURE(status)) { /* retry */
            // first attempt failed: resize to the reported length and retry once
            out.resize(out_len, false/*destroy contents*/);
            status = U_ZERO_ERROR;
            out_len = to_lower
               ? ucasemap_utf8ToLower(csm, out.data(), out.size(), src, src_len, &status)
               : ucasemap_utf8ToUpper(csm, out.data(), out.size(), src, src_len, &status);

            STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                               // we do have the buffer size required to complete this op
         }

         SET_STRING_ELT(ret, i, Rf_mkCharLenCE(out.data(), out_len, CE_UTF8));
      }
   }

   if (csm) { ucasemap_close(csm); csm = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (csm) { ucasemap_close(csm); csm = NULL; }
   })
}