char * Unicode_ToTitle(const char *str, // IN const char *locale) // IN { UCaseMap *caseMap; UErrorCode status = U_ZERO_ERROR; char *utf8Dest; const char *utf8Src = (const char *)str; int32_t utf8SrcLen = strlen(utf8Src); int32_t destCapacity = utf8SrcLen + 1; int32_t destLen; char *result = NULL; // Most title-case operations don't change the length of the string. utf8Dest = (char *)Util_SafeMalloc(destCapacity); caseMap = ucasemap_open(locale, 0, &status); if (U_FAILURE(status)) { goto out; } destLen = ucasemap_utf8ToTitle(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); if (status != U_BUFFER_OVERFLOW_ERROR) { goto out; } // If we need a bigger buffer, then reallocate and retry. destCapacity = destLen + 1; utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity); status = U_ZERO_ERROR; destLen = ucasemap_utf8ToTitle(caseMap, utf8Dest, destCapacity, utf8Src, utf8SrcLen, &status); out: ucasemap_close(caseMap); if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) { result = utf8Dest; } else { ASSERT(U_SUCCESS(status)); ASSERT(status != U_STRING_NOT_TERMINATED_WARNING); } return result; }
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; IcuTestErrorCode errorCode(*this, "TestCasingImpl"); LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode)); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode); ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode); result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0); if(errorCode.isFailure()) { errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); errorCode.reset(); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); errln("expected \"" + output + "\" got \"" + result + "\"" ); } }
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; UCaseMap *csm; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open(localeID, options, &errorCode); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. int32_t size=1; // Not 0 because that only gives preflighting. UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode); ucasemap_setBreakIterator(csm, clone, &errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode); result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0); if(U_FAILURE(errorCode)) { errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); } ucasemap_close(csm); }
/* Try titlecasing with options. */ static void TestUCaseMapToTitle(void) { /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */ /* * Note: The sentence BreakIterator does not recognize a '.' * as a sentence terminator if it is followed by lowercase. * That is why the example has the '!'. */ static const UChar beforeTitle[]= { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e }, titleWord[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e }, titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e }, titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e }; UChar buffer[32]; UCaseMap *csm; UBreakIterator *sentenceIter; const UBreakIterator *iter; int32_t length; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open("", 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode)); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=NULL) { log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter); } /* Use default UBreakIterator: Word breaks. */ length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWord) || 0!=u_memcmp(buffer, titleWord, length) || buffer[length]!=0 ) { log_err_status(errorCode, "ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } if (U_SUCCESS(errorCode)) { iter=ucasemap_getBreakIterator(csm); if(iter==NULL) { log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n"); } } /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. */ ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode); if(U_FAILURE(errorCode)) { log_err_status(errorCode, "error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode)); return; } length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleWordNoAdjust) || 0!=u_memcmp(buffer, titleWordNoAdjust, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* Set a sentence break iterator. */ errorCode=U_ZERO_ERROR; sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode)); ucasemap_close(csm); return; } ucasemap_setBreakIterator(csm, sentenceIter, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode)); ubrk_close(sentenceIter); ucasemap_close(csm); return; } iter=ucasemap_getBreakIterator(csm); if(iter!=sentenceIter) { log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter); } ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode); if(U_FAILURE(errorCode)) { log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode)); return; } /* Use the sentence break iterator with the option. Preflight first. */ length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=UPRV_LENGTHOF(titleSentNoLower) ) { log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } errorCode=U_ZERO_ERROR; buffer[0]=0; length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); if( U_FAILURE(errorCode) || length!=UPRV_LENGTHOF(titleSentNoLower) || 0!=u_memcmp(buffer, titleSentNoLower, length) || buffer[length]!=0 ) { log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */ { char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64]; int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength; errorCode=U_ZERO_ERROR; u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode); u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, UPRV_LENGTHOF(titleSentNoLower), &errorCode); length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode); if( U_FAILURE(errorCode) || length!=utf8TitleSentNoLowerLength || 0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) || utf8[length]!=0 ) { log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode)); } } ucasemap_close(csm); }
/** * Convert case (TitleCase) * * * @param str character vector * @param opts_brkiter list * @return character vector * * @version 0.4-1 (Marek Gagolewski, 2014-12-03) * separated from stri_trans_casemap; * use StriUBreakIterator */ SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) { StriBrkIterOptions opts_brkiter2(opts_brkiter, "word"); PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50) UCaseMap* ucasemap = NULL; STRI__ERROR_HANDLER_BEGIN(1) StriUBreakIterator brkiter(opts_brkiter2); UErrorCode status = U_ZERO_ERROR; ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) status = U_ZERO_ERROR; ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) brkiter.free(false); // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do? // now briter is owned by ucasemap. // it will be released on ucasemap_close // (checked with ICU man & src code) R_len_t str_n = LENGTH(str); StriContainerUTF8 str_cont(str, str_n); SEXP ret; STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n)); // STEP 1. // Estimate the required buffer length // Notice: The resulting number of codepoints may be larger or smaller than // the number before casefolding R_len_t bufsize = str_cont.getMaxNumBytes(); bufsize += 10; // a small margin String8buf buf(bufsize); // STEP 2. // Do case folding for (R_len_t i = str_cont.vectorize_init(); i != str_cont.vectorize_end(); i = str_cont.vectorize_next(i)) { if (str_cont.isNA(i)) { SET_STRING_ELT(ret, i, NA_STRING); continue; } R_len_t str_cur_n = str_cont.get(i).length(); const char* str_cur_s = str_cont.get(i).c_str(); status = U_ZERO_ERROR; int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); if (U_FAILURE(status)) { buf.resize(buf_need, false/*destroy contents*/); status = U_ZERO_ERROR; buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status); STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen // we do have the buffer size required to complete this op } SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8)); } if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;} STRI__UNPROTECT_ALL return ret; STRI__ERROR_HANDLER_END({ if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; } }) }