Example #1
0
/*
 * Lowercase the NUL-terminated UTF-8 string src_utf8 with ICU's
 * ucasemap_utf8ToLower() and store the result in dest_utf8.
 *
 * The output is requested at offset dest_pos, i.e. directly after whatever
 * was already marked as used in dest_utf8 when the function was entered.
 * On return the used size covers both the earlier contents and the newly
 * written lowercase bytes.
 *
 * Any ICU failure other than the handled buffer overflow is passed to
 * i_fatal().
 */
void fts_icu_lcase(string_t *dest_utf8, const char *src_utf8)
{
    struct UCaseMap *csm = fts_icu_csm();
    size_t avail_bytes, dest_pos = dest_utf8->used;
    char *dest_data;
    int dest_full_len;
    UErrorCode err = U_ZERO_ERROR;

    /* First attempt: offer ICU all the space that is writable after
       dest_pos. */
    avail_bytes = buffer_get_writable_size(dest_utf8) - dest_pos;
    dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos, avail_bytes);

    dest_full_len = ucasemap_utf8ToLower(csm, dest_data, avail_bytes,
                                         src_utf8, -1, &err);
    if (err == U_BUFFER_OVERFLOW_ERROR) {
        /* The returned length is what the whole result needs: ask the
           buffer for exactly that much at the same offset and redo the
           conversion. */
        err = U_ZERO_ERROR;
        dest_data = buffer_get_space_unsafe(dest_utf8, dest_pos, dest_full_len);
        dest_full_len = ucasemap_utf8ToLower(csm, dest_data, dest_full_len,
                                             src_utf8, -1, &err);
        i_assert(err != U_BUFFER_OVERFLOW_ERROR);
    }
    if (U_FAILURE(err)) {
        i_fatal("LibICU ucasemap_utf8ToLower() failed: %s",
                u_errorName(err));
    }
    /* The result starts at dest_pos, so the used size has to include the
       bytes in front of it as well; using dest_full_len alone would cut
       the output short whenever the buffer was not empty on entry. */
    buffer_set_used_size(dest_utf8, dest_pos + dest_full_len);
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    IcuTestErrorCode errorCode(*this, "TestCasingImpl");
    LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode));
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode);
        ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode);
    result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0);

    if(errorCode.isFailure()) {
        errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
        errorCode.reset();
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
        errln("expected \"" + output + "\" got \"" + result + "\"" );
    }
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    UCaseMap *csm;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open(localeID, options, &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        int32_t size=1;  // Not 0 because that only gives preflighting.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode);
        ucasemap_setBreakIterator(csm, clone, &errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode);
    result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0);

    if(U_FAILURE(errorCode)) {
        errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
    }
    ucasemap_close(csm);
}
Example #4
0
/*
 * API test for UCaseMap;
 * test cases for actual case mappings using UCaseMap see
 * intltest utility/UnicodeStringTest/StringCaseTest/TestCasing
 *
 * Opens one UCaseMap, checks its locale/options getters and setters, then
 * calls the UTF-8 case mapping functions with valid and deliberately
 * invalid arguments and verifies the reported error code, the returned
 * length and the contents of the output buffer. Every mismatch is reported
 * through log_err(); the function only returns early when ucasemap_open()
 * fails.
 */
static void
TestUCaseMap(void) {
    /* ASCII test strings spelled as byte values */
    static const char
        aBc[] ={ 0x61, 0x42, 0x63, 0 },
        abc[] ={ 0x61, 0x62, 0x63, 0 },
        ABCg[]={ 0x41, 0x42, 0x43, 0x67, 0 },
        defg[]={ 0x64, 0x65, 0x66, 0x67, 0 };
    char utf8Out[8];

    UCaseMap *csm;
    const char *locale;
    uint32_t options;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("tur", 0xa5, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"tur\") failed - %s\n", u_errorName(errorCode));
        return;
    }
    /* the three-letter code is expected to come back as "tr" */
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "tr")) {
        log_err("ucasemap_getLocale(ucasemap_open(\"tur\"))==%s!=\"tr\"\n", locale);
    }
    /* overly long locale IDs get truncated to their language code to avoid unnecessary allocation */
    ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode);
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "i-klingon")) {
        log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s!=\"i-klingon\"\n", locale);
    }

    /* options round trip: the value given to open/setOptions must be returned unchanged */
    errorCode=U_ZERO_ERROR;
    options=ucasemap_getOptions(csm);
    if(options!=0xa5) {
        log_err("ucasemap_getOptions(ucasemap_open(0xa5))==0x%lx!=0xa5\n", (long)options);
    }
    ucasemap_setOptions(csm, 0x333333, &errorCode);
    options=ucasemap_getOptions(csm);
    if(options!=0x333333) {
        log_err("ucasemap_getOptions(ucasemap_setOptions(0x333333))==0x%lx!=0x333333\n", (long)options);
    }

    /* test case mapping API; not all permutations necessary due to shared implementation code */

    /* NUL terminated source */
    errorCode=U_ZERO_ERROR;
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8ToLower(aBc\\0) failed\n");
    }

    /* incoming failure code: the error must be kept and the output left untouched */
    errorCode=U_PARSE_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_PARSE_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(failure) failed\n");
    }

    /* overlapping input & output */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, utf8Out+1, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 1) failed\n");
    }

    /* overlap in the other direction */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out+1, 2, utf8Out, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 2) failed\n");
    }

    /* NULL destination */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, NULL, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(dest=NULL) failed\n");
    }

    /* destCapacity<0 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, -2, aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(destCapacity<0) failed\n");
    }

    /* NULL source */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), NULL, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(src=NULL) failed\n");
    }

    /* srcLength<-1 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(srcLength<-1) failed\n");
    }

    /* buffer overflow: only the bytes beyond the 2-byte capacity are required to be intact */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, aBc, 3, &errorCode);
    if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3 || 0!=strcmp(defg+2, utf8Out+2)) {
        log_err("ucasemap_utf8ToUpper(overflow) failed\n");
    }

    /* dest not terminated (leaves g from defg alone) */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 3, aBc, 3, &errorCode);
    if(errorCode!=U_STRING_NOT_TERMINATED_WARNING || length!=3 || 0!=strcmp(ABCg, utf8Out)) {
        /* distinct message so this check is not mistaken for the overflow check above */
        log_err("ucasemap_utf8ToUpper(not terminated) failed\n");
    }

    /* C API coverage for case folding. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    errorCode=U_ZERO_ERROR;
    utf8Out[0]=0;
    length=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, 3, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8FoldCase(aBc) failed\n");
    }

    ucasemap_close(csm);
}
Example #5
0
/*
 * Unicode_ToLower --
 *
 *    Returns a newly allocated, NUL-terminated lower-case copy of the
 *    UTF-8 string 'str', mapped with an ICU UCaseMap opened for 'locale'.
 *
 *    The first attempt uses a buffer one byte larger than the source. If
 *    ICU reports that the result does not fit, or that it fits but leaves
 *    no room for the terminating NUL, the buffer is resized to the length
 *    ICU returned plus one and the mapping is run a second time.
 *
 * Results:
 *    The allocated result string on success. NULL if ICU reports an
 *    error (ASSERT-enabled builds assert instead); the temporary buffer
 *    is released in that case.
 */
char *
Unicode_ToLower(const char *str,    // IN
                const char *locale) // IN
{
   UCaseMap *caseMap;
   UErrorCode status = U_ZERO_ERROR;
   char *utf8Dest;
   const char *utf8Src = (const char *)str;
   int32_t utf8SrcLen = strlen(utf8Src);
   int32_t destCapacity = utf8SrcLen + 1;
   int32_t destLen;
   char *result = NULL;

   /*
    * XXX TODO: This and the two following functions are substantially
    * identical.  Refactor them!  (Note that ucasemap_utf8ToTitle
    * takes a non-const UCaseMap, so we can't just use pointers to
    * functions unless we cast.)
    */

   // Most lower-case operations don't change the length of the string.
   utf8Dest = (char *)Util_SafeMalloc(destCapacity);

   caseMap = ucasemap_open(locale, 0, &status);
   if (U_FAILURE(status)) {
      goto out;
   }

   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

   /*
    * Retry on overflow, and also when the output filled the buffer
    * exactly (result one byte longer than the source): ICU then reports
    * U_STRING_NOT_TERMINATED_WARNING instead of an overflow, and the
    * string would otherwise be rejected below although one more byte of
    * capacity is all that is missing.
    */
   if (status != U_BUFFER_OVERFLOW_ERROR &&
       status != U_STRING_NOT_TERMINATED_WARNING) {
      goto out;
   }

   // If we need a bigger buffer, then reallocate and retry.
   destCapacity = destLen + 1;
   utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity);

   status = U_ZERO_ERROR;
   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

  out:
   ucasemap_close(caseMap);

   if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
      result = utf8Dest;
   } else {
      ASSERT(U_SUCCESS(status));
      ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
      // Reached only when the ASSERTs above do not stop execution: the
      // buffer is not handed to the caller, so release it here.
      free(utf8Dest);
   }

   return result;
}
Example #6
0
/**
 *  Convert case (upper, lowercase)
 *
 *
 *  @param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   // _type selects the mapping: 1 calls ucasemap_utf8ToLower,
   // 2 calls ucasemap_utf8ToUpper; anything else is rejected
   if (!(_type >= 1 && _type <= 2)) Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* casemap = NULL; // declared outside the handler so that the cleanup code can see it

   STRI__ERROR_HANDLER_BEGIN(1)
   UErrorCode status = U_ZERO_ERROR;
   casemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t n_strings = LENGTH(str);
   StriContainerUTF8 strings(str, n_strings);
   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(STRSXP, n_strings));

   // STEP 1.
   // Pick an initial size for the output buffer: the largest input byte
   // count plus a small margin. The mapped string may still be longer
   // (or shorter) than its input; STEP 2 grows the buffer when needed.
   R_len_t capacity = 10; // a small margin
   capacity += strings.getMaxNumBytes();
   String8buf out(capacity);

   // STEP 2.
   // Map each string in turn
   for (R_len_t i = strings.vectorize_init();
         i != strings.vectorize_end();
         i = strings.vectorize_next(i))
   {
      if (strings.isNA(i)) {
         // missing values stay missing
         SET_STRING_ELT(result, i, NA_STRING);
         continue;
      }

      const char* cur_bytes = strings.get(i).c_str();
      R_len_t cur_len       = strings.get(i).length();

      status = U_ZERO_ERROR;
      int out_len = (_type == 1)
         ? ucasemap_utf8ToLower(casemap,
              out.data(), out.size(), cur_bytes, cur_len, &status)
         : ucasemap_utf8ToUpper(casemap,
              out.data(), out.size(), cur_bytes, cur_len, &status);

      if (U_FAILURE(status)) { /* retry */
         // resize the buffer to the length reported by the failed call
         // and run the same mapping once more
         out.resize(out_len, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         out_len = (_type == 1)
            ? ucasemap_utf8ToLower(casemap,
                 out.data(), out.size(), cur_bytes, cur_len, &status)
            : ucasemap_utf8ToUpper(casemap,
                 out.data(), out.size(), cur_bytes, cur_len, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(result, i, Rf_mkCharLenCE(out.data(), out_len, CE_UTF8));
   }

   // normal exit: release the case map before returning
   if (casemap) { ucasemap_close(casemap); casemap = NULL;}
   STRI__UNPROTECT_ALL
   return result;

   STRI__ERROR_HANDLER_END({
      // error exit: release the case map if it is still open
      if (casemap) { ucasemap_close(casemap); casemap = NULL; }
   })
}