/*
 * Lower-case the NUL-terminated UTF-8 string src_utf8 with ICU and store the
 * result in dest_utf8, starting at the buffer's current used offset (i.e. the
 * output is appended after whatever dest_utf8 already contains).
 *
 * The first conversion attempt uses whatever writable space the buffer
 * currently reports. If ICU signals U_BUFFER_OVERFLOW_ERROR, the space is
 * re-requested with the length ICU reported and the conversion is run a
 * second time. Any remaining ICU failure is fatal.
 */
void fts_icu_lcase(string_t *dest_utf8, const char *src_utf8)
{
	struct UCaseMap *csm = fts_icu_csm();
	size_t avail_bytes, dest_pos = dest_utf8->used;
	char *dest_data;
	int dest_full_len;
	UErrorCode err = U_ZERO_ERROR;

	/* First try: convert straight into the space that is already there. */
	avail_bytes = buffer_get_writable_size(dest_utf8) - dest_pos;
	dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos, avail_bytes);

	/* srcLength -1: ICU determines the length from the NUL terminator. */
	dest_full_len = ucasemap_utf8ToLower(csm, dest_data, avail_bytes,
					     src_utf8, -1, &err);
	if (err == U_BUFFER_OVERFLOW_ERROR) {
		/* Too small: dest_full_len now holds the length ICU asked
		   for, so request exactly that much and convert again. */
		err = U_ZERO_ERROR;
		dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos,
						    dest_full_len);
		dest_full_len = ucasemap_utf8ToLower(csm, dest_data,
						     dest_full_len,
						     src_utf8, -1, &err);
		i_assert(err != U_BUFFER_OVERFLOW_ERROR);
	}
	if (U_FAILURE(err)) {
		i_fatal("LibICU ucasemap_utf8ToLower() failed: %s",
			u_errorName(err));
	}
	/* The output was written at offset dest_pos, so the used size must
	   include the pre-existing bytes. Using dest_full_len alone would
	   cut the buffer short whenever dest_utf8 was not empty on entry. */
	buffer_set_used_size(dest_utf8, dest_pos + dest_full_len);
}
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; IcuTestErrorCode errorCode(*this, "TestCasingImpl"); LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode)); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode); ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(), utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode); result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0); if(errorCode.isFailure()) { errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); errorCode.reset(); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); errln("expected \"" + output + "\" got \"" + result + "\"" ); } }
void StringCaseTest::TestCasingImpl(const UnicodeString &input, const UnicodeString &output, int32_t whichCase, void *iter, const char *localeID, uint32_t options) { // UnicodeString UnicodeString result; const char *name; Locale locale(localeID); result=input; switch(whichCase) { case TEST_LOWER: name="toLower"; result.toLower(locale); break; case TEST_UPPER: name="toUpper"; result.toUpper(locale); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="toTitle"; result.toTitle((BreakIterator *)iter, locale, options); break; #endif case TEST_FOLD: name="foldCase"; result.foldCase(options); break; default: name=""; break; // won't happen } if(result!=output) { errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name); } #if !UCONFIG_NO_BREAK_ITERATION if(whichCase==TEST_TITLE && options==0) { result=input; result.toTitle((BreakIterator *)iter, locale); if(result!=output) { errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res"); } } #endif // UTF-8 char utf8In[100], utf8Out[100]; int32_t utf8InLength, utf8OutLength, resultLength; UChar *buffer; UCaseMap *csm; UErrorCode errorCode; errorCode=U_ZERO_ERROR; csm=ucasemap_open(localeID, options, &errorCode); #if !UCONFIG_NO_BREAK_ITERATION if(iter!=NULL) { // Clone the break iterator so that the UCaseMap can safely adopt it. int32_t size=1; // Not 0 because that only gives preflighting. 
UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode); ucasemap_setBreakIterator(csm, clone, &errorCode); } #endif u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode); switch(whichCase) { case TEST_LOWER: name="ucasemap_utf8ToLower"; utf8OutLength=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; case TEST_UPPER: name="ucasemap_utf8ToUpper"; utf8OutLength=ucasemap_utf8ToUpper(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #if !UCONFIG_NO_BREAK_ITERATION case TEST_TITLE: name="ucasemap_utf8ToTitle"; utf8OutLength=ucasemap_utf8ToTitle(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; #endif case TEST_FOLD: name="ucasemap_utf8FoldCase"; utf8OutLength=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), utf8In, utf8InLength, &errorCode); break; default: name=""; utf8OutLength=0; break; // won't happen } buffer=result.getBuffer(utf8OutLength); u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode); result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0); if(U_FAILURE(errorCode)) { errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode)); } else if(result!=output) { errln("error: %s() got a wrong result for a test case from casing.res", name); } ucasemap_close(csm); }
/*
 * API test for UCaseMap;
 * test cases for actual case mappings using UCaseMap see
 * intltest utility/UnicodeStringTest/StringCaseTest/TestCasing
 *
 * Covers open/close, the locale and options getters/setters, and the
 * argument checking and buffer handling of the UTF-8 case mapping functions.
 * Each check logs its own message through log_err() so that a failure can be
 * attributed to one specific scenario.
 */
static void
TestUCaseMap(void) {
    /* test strings spelled as byte values: "aBc", "abc", "ABCg", "defg" */
    static const char
        aBc[] ={ 0x61, 0x42, 0x63, 0 },
        abc[] ={ 0x61, 0x62, 0x63, 0 },
        ABCg[]={ 0x41, 0x42, 0x43, 0x67, 0 },
        defg[]={ 0x64, 0x65, 0x66, 0x67, 0 };
    char utf8Out[8];

    UCaseMap *csm;
    const char *locale;
    uint32_t options;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("tur", 0xa5, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"tur\") failed - %s\n", u_errorName(errorCode));
        return;
    }
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "tr")) {
        log_err("ucasemap_getLocale(ucasemap_open(\"tur\"))==%s!=\"tr\"\n", locale);
    }
    /* overly long locale IDs get truncated to their language code to avoid unnecessary allocation */
    ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode);
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "i-klingon")) {
        log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s!=\"i-klingon\"\n", locale);
    }

    /* options getter/setter round trip */
    errorCode=U_ZERO_ERROR;
    options=ucasemap_getOptions(csm);
    if(options!=0xa5) {
        log_err("ucasemap_getOptions(ucasemap_open(0xa5))==0x%lx!=0xa5\n", (long)options);
    }
    ucasemap_setOptions(csm, 0x333333, &errorCode);
    options=ucasemap_getOptions(csm);
    if(options!=0x333333) {
        log_err("ucasemap_getOptions(ucasemap_setOptions(0x333333))==0x%lx!=0x333333\n", (long)options);
    }

    /* test case mapping API; not all permutations necessary due to shared implementation code */

    /* NUL terminated source */
    errorCode=U_ZERO_ERROR;
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8ToLower(aBc\\0) failed\n");
    }

    /* incoming failure code: the error must be kept and the output left untouched */
    errorCode=U_PARSE_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_PARSE_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(failure) failed\n");
    }

    /* overlapping input & output */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, utf8Out+1, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 1) failed\n");
    }

    /* overlap in the other direction */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out+1, 2, utf8Out, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 2) failed\n");
    }

    /* NULL destination */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, NULL, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(dest=NULL) failed\n");
    }

    /* destCapacity<0 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, -2, aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(destCapacity<0) failed\n");
    }

    /* NULL source */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), NULL, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(src=NULL) failed\n");
    }

    /* srcLength<-1 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(srcLength<-1) failed\n");
    }

    /* buffer overflow: only the tail beyond the 2-byte capacity is compared */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, aBc, 3, &errorCode);
    if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3 || 0!=strcmp(defg+2, utf8Out+2)) {
        log_err("ucasemap_utf8ToUpper(overflow) failed\n");
    }

    /* dest not terminated (leaves g from defg alone) */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 3, aBc, 3, &errorCode);
    if(errorCode!=U_STRING_NOT_TERMINATED_WARNING || length!=3 || 0!=strcmp(ABCg, utf8Out)) {
        /* distinct message so this case is not confused with the overflow check above */
        log_err("ucasemap_utf8ToUpper(not terminated) failed\n");
    }

    /* C API coverage for case folding. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    errorCode=U_ZERO_ERROR;
    utf8Out[0]=0;
    length=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, 3, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8FoldCase(aBc) failed\n");
    }

    ucasemap_close(csm);
}
/*
 * Unicode_ToLower --
 *
 *      Returns a newly allocated, NUL-terminated, lower-cased copy of the
 *      UTF-8 string 'str', using ICU's ucasemap_utf8ToLower() with the
 *      given locale.
 *
 * Results:
 *      The lower-cased string on success; the caller owns the buffer.
 *      NULL if ICU reports a failure (debug builds ASSERT first).
 *
 * Side effects:
 *      Allocates memory via Util_SafeMalloc / Util_SafeRealloc.
 */
char *
Unicode_ToLower(const char *str,    // IN
                const char *locale) // IN
{
   UCaseMap *caseMap;
   UErrorCode status = U_ZERO_ERROR;
   char *utf8Dest;
   const char *utf8Src = (const char *)str;
   int32_t utf8SrcLen = strlen(utf8Src);
   int32_t destCapacity = utf8SrcLen + 1;
   int32_t destLen;
   char *result = NULL;

   /*
    * XXX TODO: This and the two following functions are substantially
    * identical. Refactor them! (Note that ucasemap_utf8ToTitle
    * takes a non-const UCaseMap, so we can't just use pointers to
    * functions unless we cast.)
    */

   // Most lower-case operations don't change the length of the string.
   utf8Dest = (char *)Util_SafeMalloc(destCapacity);

   caseMap = ucasemap_open(locale, 0, &status);
   if (U_FAILURE(status)) {
      goto out;
   }

   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

   /*
    * Retry both when the output did not fit at all and when it fit but left
    * no room for the terminating NUL (output exactly one byte longer than
    * the input). In either case destLen is the length ICU needs.
    */
   if (status != U_BUFFER_OVERFLOW_ERROR &&
       status != U_STRING_NOT_TERMINATED_WARNING) {
      goto out;
   }

   // If we need a bigger buffer, then reallocate and retry.
   destCapacity = destLen + 1;
   utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity);

   status = U_ZERO_ERROR;
   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

out:
   // The map is closed on every path, including when ucasemap_open failed.
   ucasemap_close(caseMap);

   if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
      result = utf8Dest;
   } else {
      ASSERT(U_SUCCESS(status));
      ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
      /*
       * Don't leak the scratch buffer when returning NULL.
       * NOTE(review): assumes Util_SafeMalloc memory is released with
       * free() -- confirm against the allocator's contract.
       */
      free(utf8Dest);
   }

   return result;
}
/**
 * Convert case (upper, lowercase)
 *
 *
 * @param str character vector
 * @param _type internal selector: 1 maps to lower case, 2 maps to upper case;
 *    any other value raises an R error
 * @param locale single string identifying
 *    the locale ("" or NULL for default locale)
 * @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
    if (_type < 1 || _type > 2)
        Rf_error(MSG__INCORRECT_INTERNAL_ARG);

    const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
    PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

    // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
    UCaseMap* ucasemap = NULL;

    STRI__ERROR_HANDLER_BEGIN(1)
    UErrorCode status = U_ZERO_ERROR;
    ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
    STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

    R_len_t str_n = LENGTH(str);
    StriContainerUTF8 str_cont(str, str_n);
    SEXP ret;
    STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));

    // The selector was validated on entry, so "not 1" means 2 (upper case).
    const bool to_lower = (_type == 1);

    // STEP 1.
    // Start with a buffer sized after the longest input plus a small margin;
    // the mapped string may still be longer or shorter than its source.
    R_len_t longest_input = str_cont.getMaxNumBytes();
    String8buf buf(longest_input + 10);

    // STEP 2.
    // Map each element, growing the buffer on demand.
    for (R_len_t i = str_cont.vectorize_init();
            i != str_cont.vectorize_end();
            i = str_cont.vectorize_next(i))
    {
        if (str_cont.isNA(i)) {
            SET_STRING_ELT(ret, i, NA_STRING);
        }
        else {
            R_len_t cur_len = str_cont.get(i).length();
            const char* cur_bytes = str_cont.get(i).c_str();

            status = U_ZERO_ERROR;
            int out_len = to_lower
                ? ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(),
                      cur_bytes, cur_len, &status)
                : ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(),
                      cur_bytes, cur_len, &status);

            if (U_FAILURE(status)) {
                // First attempt failed: resize to the reported length and
                // run the mapping once more.
                buf.resize(out_len, false/*destroy contents*/);
                status = U_ZERO_ERROR;
                out_len = to_lower
                    ? ucasemap_utf8ToLower(ucasemap, buf.data(), buf.size(),
                          cur_bytes, cur_len, &status)
                    : ucasemap_utf8ToUpper(ucasemap, buf.data(), buf.size(),
                          cur_bytes, cur_len, &status);

                STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                // we do have the buffer size required to complete this op
            }

            SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), out_len, CE_UTF8));
        }
    }

    if (ucasemap) {
        ucasemap_close(ucasemap);
        ucasemap = NULL;
    }
    STRI__UNPROTECT_ALL
    return ret;
    STRI__ERROR_HANDLER_END({
        if (ucasemap) {
            ucasemap_close(ucasemap);
            ucasemap = NULL;
        }
    })
}