/*
 * Unicode_ToTitle --
 *
 *    Title-cases a NUL-terminated UTF-8 string using ICU's UCaseMap opened
 *    for the given locale (with options 0).
 *
 *    The first attempt uses a buffer of strlen(str) + 1 bytes. If ICU
 *    reports that the output did not fit -- either U_BUFFER_OVERFLOW_ERROR
 *    or U_STRING_NOT_TERMINATED_WARNING (output filled the buffer exactly,
 *    leaving no room for the terminator) -- the buffer is resized to the
 *    length ICU returned plus one and the conversion is run a second time.
 *
 * Results:
 *    A newly allocated, NUL-terminated UTF-8 string on success; the buffer
 *    comes from Util_SafeMalloc/Util_SafeRealloc and is owned by the caller.
 *    NULL on failure (after asserting); the buffer is released in that case.
 */
char *
Unicode_ToTitle(const char *str,    // IN
                const char *locale) // IN
{
   UCaseMap *caseMap;
   UErrorCode status = U_ZERO_ERROR;
   char *utf8Dest;
   const char *utf8Src = (const char *)str;
   int32_t utf8SrcLen = strlen(utf8Src);
   int32_t destCapacity = utf8SrcLen + 1;
   int32_t destLen;
   char *result = NULL;

   // Most title-case operations don't change the length of the string.
   utf8Dest = (char *)Util_SafeMalloc(destCapacity);

   caseMap = ucasemap_open(locale, 0, &status);
   if (U_FAILURE(status)) {
      goto out;
   }

   destLen = ucasemap_utf8ToTitle(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

   /*
    * Retry when the output was too long for the buffer, and also when it
    * fit exactly but could not be NUL-terminated: the latter is reported
    * as a (successful) warning rather than an overflow error.
    */
   if (status != U_BUFFER_OVERFLOW_ERROR &&
       status != U_STRING_NOT_TERMINATED_WARNING) {
      goto out;
   }

   // If we need a bigger buffer, then reallocate and retry.
   destCapacity = destLen + 1;
   utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity);

   status = U_ZERO_ERROR;
   destLen = ucasemap_utf8ToTitle(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

  out:
   ucasemap_close(caseMap);

   if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
      result = utf8Dest;
   } else {
      /*
       * Nothing is returned to the caller, so release the work buffer
       * instead of leaking it.
       * NOTE(review): assumes Util_SafeMalloc memory is released with
       * free() -- confirm against the Util allocator.
       */
      free(utf8Dest);
      ASSERT(U_SUCCESS(status));
      ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
   }

   return result;
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    IcuTestErrorCode errorCode(*this, "TestCasingImpl");
    LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode));
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode);
        ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode);
    result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0);

    if(errorCode.isFailure()) {
        errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
        errorCode.reset();
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
        errln("expected \"" + output + "\" got \"" + result + "\"" );
    }
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    UCaseMap *csm;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open(localeID, options, &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        int32_t size=1;  // Not 0 because that only gives preflighting.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode);
        ucasemap_setBreakIterator(csm, clone, &errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode);
    result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0);

    if(U_FAILURE(errorCode)) {
        errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
    }
    ucasemap_close(csm);
}
Beispiel #4
0
/*
 * Try titlecasing with options.
 *
 * Exercises ucasemap_toTitle() with the default iterator, with
 * U_TITLECASE_NO_BREAK_ADJUSTMENT, and with an explicitly set sentence
 * iterator plus U_TITLECASE_NO_LOWERCASE (preflight and real call), then
 * repeats the last configuration through ucasemap_utf8ToTitle().
 * Once the UCaseMap has been opened, every exit path closes it.
 */
static void
TestUCaseMapToTitle(void) {
    /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */
    /*
     * Note: The sentence BreakIterator does not recognize a '.'
     * as a sentence terminator if it is followed by lowercase.
     * That is why the example has the '!'.
     */
    static const UChar

    beforeTitle[]=      { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e },
    titleWord[]=        { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e },
    titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e },
    titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e };

    UChar buffer[32];
    UCaseMap *csm;
    UBreakIterator *sentenceIter;
    const UBreakIterator *iter;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("", 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode));
        return;
    }

    iter=ucasemap_getBreakIterator(csm);
    if(iter!=NULL) {
        log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter);
    }

    /* Use default UBreakIterator: Word breaks. */
    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleWord) ||
        0!=u_memcmp(buffer, titleWord, length) ||
        buffer[length]!=0
    ) {
        log_err_status(errorCode, "ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }
    if (U_SUCCESS(errorCode)) {
        iter=ucasemap_getBreakIterator(csm);
        if(iter==NULL) {
            log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n");
        }
    }

    /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. */
    ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err_status(errorCode, "error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode));
        /* Do not leak the case map on this early exit. */
        ucasemap_close(csm);
        return;
    }

    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleWordNoAdjust) ||
        0!=u_memcmp(buffer, titleWordNoAdjust, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* Set a sentence break iterator. */
    errorCode=U_ZERO_ERROR;
    sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode));
        ucasemap_close(csm);
        return;
    }
    ucasemap_setBreakIterator(csm, sentenceIter, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode));
        ubrk_close(sentenceIter);
        ucasemap_close(csm);
        return;
    }
    iter=ucasemap_getBreakIterator(csm);
    if(iter!=sentenceIter) {
        log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter);
    }

    ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode));
        /*
         * Do not leak the case map on this early exit. sentenceIter was
         * handed to csm above, so it is not closed separately here.
         */
        ucasemap_close(csm);
        return;
    }

    /* Use the sentence break iterator with the option. Preflight first. */
    length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( errorCode!=U_BUFFER_OVERFLOW_ERROR ||
        length!=UPRV_LENGTHOF(titleSentNoLower)
    ) {
        log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* The preflight is expected to leave an overflow error behind; clear it before the real call. */
    errorCode=U_ZERO_ERROR;
    buffer[0]=0;
    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleSentNoLower) ||
        0!=u_memcmp(buffer, titleSentNoLower, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    {
        char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64];
        int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength;

        errorCode=U_ZERO_ERROR;
        u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
        u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, UPRV_LENGTHOF(titleSentNoLower), &errorCode);

        length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode);
        if( U_FAILURE(errorCode) ||
            length!=utf8TitleSentNoLowerLength ||
            0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) ||
            utf8[length]!=0
        ) {
            log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
        }
    }

    ucasemap_close(csm);
}
Beispiel #5
0
/**
 *  Convert case (TitleCase)
 *
 *  Title-cases each element of a character vector with ICU's
 *  ucasemap_utf8ToTitle(). The break iterator is built from opts_brkiter
 *  (with "word" passed as the type argument) and handed to the UCaseMap.
 *  NA elements stay NA.
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions brkiter_opts(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // validate/coerce the string argument

   // version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   // Declared outside the handler block so the error handler below can close it.
   UCaseMap* casemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(brkiter_opts);

   UErrorCode status = U_ZERO_ERROR;
   casemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* nothing extra to clean up */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(casemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* nothing extra to clean up */})
   // Per the original author's note (checked against the ICU manual and
   // sources): the iterator now belongs to the case map and is released
   // by ucasemap_close().
   brkiter.free(false);
   // ucasemap_setOptions(casemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?

   R_len_t n_strings = LENGTH(str);
   StriContainerUTF8 inputs(str, n_strings);
   SEXP result;
   STRI__PROTECT(result = Rf_allocVector(STRSXP, n_strings));

   // STEP 1.
   // Initial guess for the output buffer: the container's maximal byte
   // count plus a small margin. The title-cased text may be longer or
   // shorter than its input, so the loop below enlarges it on demand.
   R_len_t capacity = inputs.getMaxNumBytes();
   capacity += 10;
   String8buf outbuf(capacity);

   // STEP 2.
   // Title-case every element.
   for (R_len_t i = inputs.vectorize_init();
         i != inputs.vectorize_end();
         i = inputs.vectorize_next(i))
   {
      if (inputs.isNA(i)) {
         SET_STRING_ELT(result, i, NA_STRING);
         continue;
      }

      R_len_t cur_len      = inputs.get(i).length();
      const char* cur_text = inputs.get(i).c_str();

      status = U_ZERO_ERROR;
      int needed = ucasemap_utf8ToTitle(casemap, outbuf.data(), outbuf.size(),
               cur_text, cur_len, &status);

      if (U_FAILURE(status)) {
         // First attempt failed: resize to the length ICU returned
         // (old contents need not be kept) and try once more.
         outbuf.resize(needed, false);
         status = U_ZERO_ERROR;
         needed = ucasemap_utf8ToTitle(casemap, outbuf.data(), outbuf.size(),
               cur_text, cur_len, &status);

         // Not expected to fail: the buffer now has the requested size.
         STRI__CHECKICUSTATUS_THROW(status, {/* nothing extra to clean up */})
      }

      SET_STRING_ELT(result, i, Rf_mkCharLenCE(outbuf.data(), needed, CE_UTF8));
   }

   if (casemap) {
      ucasemap_close(casemap);
      casemap = NULL;
   }
   STRI__UNPROTECT_ALL
   return result;

   STRI__ERROR_HANDLER_END({
      if (casemap) {
         ucasemap_close(casemap);
         casemap = NULL;
      }
   })
}