Ejemplo n.º 1
0
char *
Unicode_ToTitle(const char *str,    // IN
                const char *locale) // IN
{
   UCaseMap *caseMap;
   UErrorCode status = U_ZERO_ERROR;
   char *utf8Dest;
   const char *utf8Src = (const char *)str;
   int32_t utf8SrcLen = strlen(utf8Src);
   int32_t destCapacity = utf8SrcLen + 1;
   int32_t destLen;
   char *result = NULL;

   // Most title-case operations don't change the length of the string.
   utf8Dest = (char *)Util_SafeMalloc(destCapacity);

   caseMap = ucasemap_open(locale, 0, &status);
   if (U_FAILURE(status)) {
      goto out;
   }

   destLen = ucasemap_utf8ToTitle(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

   if (status != U_BUFFER_OVERFLOW_ERROR) {
      goto out;
   }

   // If we need a bigger buffer, then reallocate and retry.
   destCapacity = destLen + 1;
   utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity);

   status = U_ZERO_ERROR;
   destLen = ucasemap_utf8ToTitle(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

  out:
   ucasemap_close(caseMap);

   if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
      result = utf8Dest;
   } else {
      ASSERT(U_SUCCESS(status));
      ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
   }

   return result;
}
Ejemplo n.º 2
0
static struct UCaseMap *fts_icu_csm(void)
{
    UErrorCode err = U_ZERO_ERROR;

    if (icu_csm != NULL)
        return icu_csm;
    icu_csm = ucasemap_open(NULL, U_FOLD_CASE_DEFAULT, &err);
    if (U_FAILURE(err)) {
        i_fatal("LibICU ucasemap_open() failed: %s",
                u_errorName(err));
    }
    return icu_csm;
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        dataerrln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            dataerrln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    IcuTestErrorCode errorCode(*this, "TestCasingImpl");
    LocalUCaseMapPointer csm(ucasemap_open(localeID, options, errorCode));
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, NULL, errorCode);
        ucasemap_setBreakIterator(csm.getAlias(), clone, errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm.getAlias(),
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, errorCode);
    result.releaseBuffer(errorCode.isSuccess() ? resultLength : 0);

    if(errorCode.isFailure()) {
        errcheckln(errorCode, "error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
        errorCode.reset();
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
        errln("expected \"" + output + "\" got \"" + result + "\"" );
    }
}
void
StringCaseTest::TestCasingImpl(const UnicodeString &input,
                               const UnicodeString &output,
                               int32_t whichCase,
                               void *iter, const char *localeID, uint32_t options) {
    // UnicodeString
    UnicodeString result;
    const char *name;
    Locale locale(localeID);

    result=input;
    switch(whichCase) {
    case TEST_LOWER:
        name="toLower";
        result.toLower(locale);
        break;
    case TEST_UPPER:
        name="toUpper";
        result.toUpper(locale);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="toTitle";
        result.toTitle((BreakIterator *)iter, locale, options);
        break;
#endif
    case TEST_FOLD:
        name="foldCase";
        result.foldCase(options);
        break;
    default:
        name="";
        break; // won't happen
    }
    if(result!=output) {
        errln("error: UnicodeString.%s() got a wrong result for a test case from casing.res", name);
    }
#if !UCONFIG_NO_BREAK_ITERATION
    if(whichCase==TEST_TITLE && options==0) {
        result=input;
        result.toTitle((BreakIterator *)iter, locale);
        if(result!=output) {
            errln("error: UnicodeString.toTitle(options=0) got a wrong result for a test case from casing.res");
        }
    }
#endif

    // UTF-8
    char utf8In[100], utf8Out[100];
    int32_t utf8InLength, utf8OutLength, resultLength;
    UChar *buffer;

    UCaseMap *csm;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open(localeID, options, &errorCode);
#if !UCONFIG_NO_BREAK_ITERATION
    if(iter!=NULL) {
        // Clone the break iterator so that the UCaseMap can safely adopt it.
        int32_t size=1;  // Not 0 because that only gives preflighting.
        UBreakIterator *clone=ubrk_safeClone((UBreakIterator *)iter, NULL, &size, &errorCode);
        ucasemap_setBreakIterator(csm, clone, &errorCode);
    }
#endif

    u_strToUTF8(utf8In, (int32_t)sizeof(utf8In), &utf8InLength, input.getBuffer(), input.length(), &errorCode);
    switch(whichCase) {
    case TEST_LOWER:
        name="ucasemap_utf8ToLower";
        utf8OutLength=ucasemap_utf8ToLower(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    case TEST_UPPER:
        name="ucasemap_utf8ToUpper";
        utf8OutLength=ucasemap_utf8ToUpper(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#if !UCONFIG_NO_BREAK_ITERATION
    case TEST_TITLE:
        name="ucasemap_utf8ToTitle";
        utf8OutLength=ucasemap_utf8ToTitle(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
#endif
    case TEST_FOLD:
        name="ucasemap_utf8FoldCase";
        utf8OutLength=ucasemap_utf8FoldCase(csm,
                    utf8Out, (int32_t)sizeof(utf8Out),
                    utf8In, utf8InLength, &errorCode);
        break;
    default:
        name="";
        utf8OutLength=0;
        break; // won't happen
    }
    buffer=result.getBuffer(utf8OutLength);
    u_strFromUTF8(buffer, result.getCapacity(), &resultLength, utf8Out, utf8OutLength, &errorCode);
    result.releaseBuffer(U_SUCCESS(errorCode) ? resultLength : 0);

    if(U_FAILURE(errorCode)) {
        errln("error: %s() got an error for a test case from casing.res - %s", name, u_errorName(errorCode));
    } else if(result!=output) {
        errln("error: %s() got a wrong result for a test case from casing.res", name);
    }
    ucasemap_close(csm);
}
Ejemplo n.º 5
0
/* Try titlecasing with options. */
static void
TestUCaseMapToTitle(void) {
    /* "a 'CaT. A 'dOg! 'eTc." where '=U+02BB */
    /*
     * Note: The sentence BreakIterator does not recognize a '.'
     * as a sentence terminator if it is followed by lowercase.
     * That is why the example has the '!'.
     */
    static const UChar

    beforeTitle[]=      { 0x61, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x54, 0x63, 0x2e },
    titleWord[]=        { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x44, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x74, 0x63, 0x2e },
    titleWordNoAdjust[]={ 0x41, 0x20, 0x2bb, 0x63, 0x61, 0x74, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x6f, 0x67, 0x21, 0x20, 0x2bb, 0x65, 0x74, 0x63, 0x2e },
    titleSentNoLower[]= { 0x41, 0x20, 0x2bb, 0x43, 0x61, 0x54, 0x2e, 0x20, 0x41, 0x20, 0x2bb, 0x64, 0x4f, 0x67, 0x21, 0x20, 0x2bb, 0x45, 0x54, 0x63, 0x2e };

    UChar buffer[32];
    UCaseMap *csm;
    UBreakIterator *sentenceIter;
    const UBreakIterator *iter;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("", 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"\") failed - %s\n", u_errorName(errorCode));
        return;
    }

    iter=ucasemap_getBreakIterator(csm);
    if(iter!=NULL) {
        log_err("ucasemap_getBreakIterator() returns %p!=NULL before setting any iterator or titlecasing\n", iter);
    }

    /* Use default UBreakIterator: Word breaks. */
    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleWord) ||
        0!=u_memcmp(buffer, titleWord, length) ||
        buffer[length]!=0
    ) {
        log_err_status(errorCode, "ucasemap_toTitle(default iterator)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }
    if (U_SUCCESS(errorCode)) {
        iter=ucasemap_getBreakIterator(csm);
        if(iter==NULL) {
            log_err("ucasemap_getBreakIterator() returns NULL after titlecasing\n");
        }
    }

    /* Try U_TITLECASE_NO_BREAK_ADJUSTMENT. */
    ucasemap_setOptions(csm, U_TITLECASE_NO_BREAK_ADJUSTMENT, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err_status(errorCode, "error: ucasemap_setOptions(U_TITLECASE_NO_BREAK_ADJUSTMENT) failed - %s\n", u_errorName(errorCode));
        return;
    }

    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleWordNoAdjust) ||
        0!=u_memcmp(buffer, titleWordNoAdjust, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(default iterator, no break adjustment)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* Set a sentence break iterator. */
    errorCode=U_ZERO_ERROR;
    sentenceIter=ubrk_open(UBRK_SENTENCE, "", NULL, 0, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ubrk_open(UBRK_SENTENCE) failed - %s\n", u_errorName(errorCode));
        ucasemap_close(csm);
        return;
    }
    ucasemap_setBreakIterator(csm, sentenceIter, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setBreakIterator(sentence iterator) failed - %s\n", u_errorName(errorCode));
        ubrk_close(sentenceIter);
        ucasemap_close(csm);
        return;
    }
    iter=ucasemap_getBreakIterator(csm);
    if(iter!=sentenceIter) {
        log_err("ucasemap_getBreakIterator() returns %p!=%p after setting the iterator\n", iter, sentenceIter);
    }

    ucasemap_setOptions(csm, U_TITLECASE_NO_LOWERCASE, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("error: ucasemap_setOptions(U_TITLECASE_NO_LOWERCASE) failed - %s\n", u_errorName(errorCode));
        return;
    }

    /* Use the sentence break iterator with the option. Preflight first. */
    length=ucasemap_toTitle(csm, NULL, 0, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( errorCode!=U_BUFFER_OVERFLOW_ERROR ||
        length!=UPRV_LENGTHOF(titleSentNoLower)
    ) {
        log_err("ucasemap_toTitle(preflight sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    errorCode=U_ZERO_ERROR;
    buffer[0]=0;
    length=ucasemap_toTitle(csm, buffer, UPRV_LENGTHOF(buffer), beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
    if( U_FAILURE(errorCode) ||
        length!=UPRV_LENGTHOF(titleSentNoLower) ||
        0!=u_memcmp(buffer, titleSentNoLower, length) ||
        buffer[length]!=0
    ) {
        log_err("ucasemap_toTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
    }

    /* UTF-8 C API coverage. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    {
        char utf8BeforeTitle[64], utf8TitleSentNoLower[64], utf8[64];
        int32_t utf8BeforeTitleLength, utf8TitleSentNoLowerLength;

        errorCode=U_ZERO_ERROR;
        u_strToUTF8(utf8BeforeTitle, (int32_t)sizeof(utf8BeforeTitle), &utf8BeforeTitleLength, beforeTitle, UPRV_LENGTHOF(beforeTitle), &errorCode);
        u_strToUTF8(utf8TitleSentNoLower, (int32_t)sizeof(utf8TitleSentNoLower), &utf8TitleSentNoLowerLength, titleSentNoLower, UPRV_LENGTHOF(titleSentNoLower), &errorCode);

        length=ucasemap_utf8ToTitle(csm, utf8, (int32_t)sizeof(utf8), utf8BeforeTitle, utf8BeforeTitleLength, &errorCode);
        if( U_FAILURE(errorCode) ||
            length!=utf8TitleSentNoLowerLength ||
            0!=uprv_memcmp(utf8, utf8TitleSentNoLower, length) ||
            utf8[length]!=0
        ) {
            log_err("ucasemap_utf8ToTitle(sentence break iterator, no lowercasing)=%ld failed - %s\n", (long)length, u_errorName(errorCode));
        }
    }

    ucasemap_close(csm);
}
Ejemplo n.º 6
0
/*
 * API test for UCaseMap;
 * test cases for actual case mappings using UCaseMap see
 * intltest utility/UnicodeStringTest/StringCaseTest/TestCasing
 */
static void
TestUCaseMap(void) {
    static const char
        aBc[] ={ 0x61, 0x42, 0x63, 0 },
        abc[] ={ 0x61, 0x62, 0x63, 0 },
        ABCg[]={ 0x41, 0x42, 0x43, 0x67, 0 },
        defg[]={ 0x64, 0x65, 0x66, 0x67, 0 };
    char utf8Out[8];

    UCaseMap *csm;
    const char *locale;
    uint32_t options;
    int32_t length;
    UErrorCode errorCode;

    errorCode=U_ZERO_ERROR;
    csm=ucasemap_open("tur", 0xa5, &errorCode);
    if(U_FAILURE(errorCode)) {
        log_err("ucasemap_open(\"tur\") failed - %s\n", u_errorName(errorCode));
        return;
    }
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "tr")) {
        log_err("ucasemap_getLocale(ucasemap_open(\"tur\"))==%s!=\"tr\"\n", locale);
    }
    /* overly long locale IDs get truncated to their language code to avoid unnecessary allocation */
    ucasemap_setLocale(csm, "I-kLInGOn-the-quick-brown-fox-jumps-over-the-lazy-dog", &errorCode);
    locale=ucasemap_getLocale(csm);
    if(0!=strcmp(locale, "i-klingon")) {
        log_err("ucasemap_getLocale(ucasemap_setLocale(\"I-kLInGOn-the-quick-br...\"))==%s!=\"i-klingon\"\n", locale);
    }

    errorCode=U_ZERO_ERROR;
    options=ucasemap_getOptions(csm);
    if(options!=0xa5) {
        log_err("ucasemap_getOptions(ucasemap_open(0xa5))==0x%lx!=0xa5\n", (long)options);
    }
    ucasemap_setOptions(csm, 0x333333, &errorCode);
    options=ucasemap_getOptions(csm);
    if(options!=0x333333) {
        log_err("ucasemap_getOptions(ucasemap_setOptions(0x333333))==0x%lx!=0x333333\n", (long)options);
    }

    /* test case mapping API; not all permutations necessary due to shared implementation code */

    /* NUL terminated source */
    errorCode=U_ZERO_ERROR;
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8ToLower(aBc\\0) failed\n");
    }

    /* incoming failure code */
    errorCode=U_PARSE_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_PARSE_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(failure) failed\n");
    }

    /* overlapping input & output */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, utf8Out+1, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 1) failed\n");
    }

    /* overlap in the other direction */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, aBc);
    length=ucasemap_utf8ToUpper(csm, utf8Out+1, 2, utf8Out, 2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(aBc, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overlap 2) failed\n");
    }

    /* NULL destination */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, NULL, (int32_t)sizeof(utf8Out), aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(dest=NULL) failed\n");
    }

    /* destCapacity<0 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, -2, aBc, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(destCapacity<0) failed\n");
    }

    /* NULL source */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), NULL, -1, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(src=NULL) failed\n");
    }

    /* srcLength<-1 */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToLower(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, -2, &errorCode);
    if(errorCode!=U_ILLEGAL_ARGUMENT_ERROR || 0!=strcmp(defg, utf8Out)) {
        log_err("ucasemap_utf8ToLower(srcLength<-1) failed\n");
    }

    /* buffer overflow */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 2, aBc, 3, &errorCode);
    if(errorCode!=U_BUFFER_OVERFLOW_ERROR || length!=3 || 0!=strcmp(defg+2, utf8Out+2)) {
        log_err("ucasemap_utf8ToUpper(overflow) failed\n");
    }

    /* dest not terminated (leaves g from defg alone) */
    errorCode=U_ZERO_ERROR;
    strcpy(utf8Out, defg);
    length=ucasemap_utf8ToUpper(csm, utf8Out, 3, aBc, 3, &errorCode);
    if(errorCode!=U_STRING_NOT_TERMINATED_WARNING || length!=3 || 0!=strcmp(ABCg, utf8Out)) {
        log_err("ucasemap_utf8ToUpper(overflow) failed\n");
    }

    /* C API coverage for case folding. More thorough test via C++ intltest's StringCaseTest::TestCasing(). */
    errorCode=U_ZERO_ERROR;
    utf8Out[0]=0;
    length=ucasemap_utf8FoldCase(csm, utf8Out, (int32_t)sizeof(utf8Out), aBc, 3, &errorCode);
    if(U_FAILURE(errorCode) || length!=3 || 0!=strcmp(abc, utf8Out)) {
        log_err("ucasemap_utf8FoldCase(aBc) failed\n");
    }

    ucasemap_close(csm);
}
Ejemplo n.º 7
0
char   *casefoldx(int flags, VSTRING *dest, const char *src, ssize_t len)
{
    size_t  old_len;

#ifdef NO_EAI

    /*
     * ASCII mode only.
     */
    if (len < 0)
	len = strlen(src);
    if ((flags & CASEF_FLAG_APPEND) == 0)
	VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);
    vstring_strncat(dest, src, len);
    lowercase(STR(dest) + old_len);
    return (STR(dest));
#else

    /*
     * Unicode mode.
     */
    const char myname[] = "casefold";
    static VSTRING *fold_buf = 0;
    static UCaseMap *csm = 0;
    UErrorCode error;
    ssize_t space_needed;
    int     n;

    /*
     * Handle special cases.
     */
    if (len < 0)
	len = strlen(src);
    if (dest == 0)
	dest = (fold_buf != 0 ? fold_buf : (fold_buf = vstring_alloc(100)));
    if ((flags & CASEF_FLAG_APPEND) == 0)
	VSTRING_RESET(dest);
    old_len = VSTRING_LEN(dest);

    /*
     * All-ASCII input, or ASCII mode only.
     */
    if ((flags & CASEF_FLAG_UTF8) == 0 || allascii(src)) {
	vstring_strncat(dest, src, len);
	lowercase(STR(dest) + old_len);
	return (STR(dest));
    }

    /*
     * ICU 4.8 ucasemap_utf8FoldCase() does not complain about UTF-8 syntax
     * errors. XXX Based on source-code review we conclude that non-UTF-8
     * bytes are copied verbatim, and experiments confirm this. Given that
     * this behavior is intentional, we assume that it will stay that way.
     */
#if 0
    if (valid_utf8_string(src, len) == 0) {
	if (err)
	    *err = "malformed UTF-8 or invalid codepoint";
	return (0);
    }
#endif

    /*
     * One-time initialization. With ICU 4.8 this works while chrooted.
     */
    if (csm == 0) {
	error = U_ZERO_ERROR;
	csm = ucasemap_open("en_US", U_FOLD_CASE_DEFAULT, &error);
	if (U_SUCCESS(error) == 0)
	    msg_fatal("ucasemap_open error: %s", u_errorName(error));
    }

    /*
     * Fold the input, adjusting the buffer size if needed. Safety: don't
     * loop forever.
     * 
     * Note: the requested amount of space for casemapped output (as reported
     * with space_needed below) does not include storage for the null
     * terminator. The terminator is written only when the output buffer is
     * large enough. This is why we overallocate space when the output does
     * not fit. But if the output fits exactly, then the ouput will be
     * unterminated, and we have to terminate the output ourselves.
     */
    for (n = 0; n < 3; n++) {
	error = U_ZERO_ERROR;
	space_needed = ucasemap_utf8FoldCase(csm, STR(dest) + old_len,
				     vstring_avail(dest), src, len, &error);
	if (U_SUCCESS(error)) {
	    VSTRING_AT_OFFSET(dest, old_len + space_needed);
	    if (vstring_avail(dest) == 0)	/* exact fit, no terminator */
		VSTRING_TERMINATE(dest);	/* add terminator */
	    break;
	} else if (error == U_BUFFER_OVERFLOW_ERROR) {
	    VSTRING_SPACE(dest, space_needed + 1);	/* for terminator */
	} else {
	    msg_fatal("%s: conversion error for \"%s\": %s",
		      myname, src, u_errorName(error));
	}
    }
    return (STR(dest));
#endif						/* NO_EAI */
}
Ejemplo n.º 8
0
char *
Unicode_ToLower(const char *str,    // IN
                const char *locale) // IN
{
   UCaseMap *caseMap;
   UErrorCode status = U_ZERO_ERROR;
   char *utf8Dest;
   const char *utf8Src = (const char *)str;
   int32_t utf8SrcLen = strlen(utf8Src);
   int32_t destCapacity = utf8SrcLen + 1;
   int32_t destLen;
   char *result = NULL;

   /*
    * XXX TODO: This and the two following functions are substantially
    * identical.  Refactor them!  (Note that ucasemap_utf8ToTitle
    * takes a non-const UCaseMap, so we can't just use pointers to
    * functions unless we cast.)
    */

   // Most lower-case operations don't change the length of the string.
   utf8Dest = (char *)Util_SafeMalloc(destCapacity);

   caseMap = ucasemap_open(locale, 0, &status);
   if (U_FAILURE(status)) {
      goto out;
   }

   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

   if (status != U_BUFFER_OVERFLOW_ERROR) {
      goto out;
   }

   // If we need a bigger buffer, then reallocate and retry.
   destCapacity = destLen + 1;
   utf8Dest = (char *)Util_SafeRealloc(utf8Dest, destCapacity);

   status = U_ZERO_ERROR;
   destLen = ucasemap_utf8ToLower(caseMap,
                                  utf8Dest,
                                  destCapacity,
                                  utf8Src,
                                  utf8SrcLen,
                                  &status);

  out:
   ucasemap_close(caseMap);

   if (U_SUCCESS(status) && status != U_STRING_NOT_TERMINATED_WARNING) {
      result = utf8Dest;
   } else {
      ASSERT(U_SUCCESS(status));
      ASSERT(status != U_STRING_NOT_TERMINATED_WARNING);
   }

   return result;
}
Ejemplo n.º 9
0
/**
 *  Convert case (TitleCase)
 *
 *
 *  @param str character vector
 *  @param opts_brkiter list
 *  @return character vector
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    separated from stri_trans_casemap;
 *    use StriUBreakIterator
 */
SEXP stri_trans_totitle(SEXP str, SEXP opts_brkiter) {
   StriBrkIterOptions opts_brkiter2(opts_brkiter, "word");
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   StriUBreakIterator brkiter(opts_brkiter2);

   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(brkiter.getLocale(), U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   status = U_ZERO_ERROR;
   ucasemap_setBreakIterator(ucasemap, brkiter.getIterator(), &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   brkiter.free(false);
   // ucasemap_setOptions(ucasemap, U_TITLECASE_NO_LOWERCASE, &status); // to do?
   // now briter is owned by ucasemap.
   // it will be released on ucasemap_close
   // (checked with ICU man & src code)

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) {
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         buf_need = ucasemap_utf8ToTitle(ucasemap, buf.data(), buf.size(),
               (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}
Ejemplo n.º 10
0
/**
 *  Convert case (upper, lowercase)
 *
 *
 *  @param str character vector
 *  @param locale single string identifying
 *         the locale ("" or NULL for default locale)
 *  @return character vector
 *
 *
 * @version 0.1-?? (Marek Gagolewski)
 *
 * @version 0.1-?? (Marek Gagolewski)
 *          use StriContainerUTF16
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-06-16)
 *          make StriException-friendly
 *
 * @version 0.1-?? (Marek Gagolewski, 2013-11-19)
 *          use UCaseMap + StriContainerUTF8
 *          **THIS DOES NOT WORK WITH ICU 4.8**, we have to revert the changes
 *          ** BTW, since stringi_0.1-25 we require ICU>=50 **
 *
 * @version 0.2-1 (Marek Gagolewski, 2014-03-18)
 *          use UCaseMap + StriContainerUTF8
 *          (this is much faster for UTF-8 and slightly faster for 8bit enc)
 *          Estimates minimal buffer size.
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-10-24)
 *          Use a custom BreakIterator with stri_trans_totitle
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-03)
 *    use StriUBreakIterator
 *
 * @version 0.6-1 (Marek Gagolewski, 2015-07-11)
 *    now this is an internal function
*/
SEXP stri_trans_casemap(SEXP str, int _type, SEXP locale)
{
   if (_type < 1 || _type > 2) Rf_error(MSG__INCORRECT_INTERNAL_ARG);
   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   PROTECT(str = stri_prepare_arg_string(str, "str")); // prepare string argument

// version 0.2-1 - Does not work with ICU 4.8 (but we require ICU >= 50)
   UCaseMap* ucasemap = NULL;

   STRI__ERROR_HANDLER_BEGIN(1)
   UErrorCode status = U_ZERO_ERROR;
   ucasemap = ucasemap_open(qloc, U_FOLD_CASE_DEFAULT, &status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_n = LENGTH(str);
   StriContainerUTF8 str_cont(str, str_n);
   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(STRSXP, str_n));


   // STEP 1.
   // Estimate the required buffer length
   // Notice: The resulting number of codepoints may be larger or smaller than
   // the number before casefolding
   R_len_t bufsize = str_cont.getMaxNumBytes();
   bufsize += 10; // a small margin
   String8buf buf(bufsize);

   // STEP 2.
   // Do case folding
   for (R_len_t i = str_cont.vectorize_init();
         i != str_cont.vectorize_end();
         i = str_cont.vectorize_next(i))
   {
      if (str_cont.isNA(i)) {
         SET_STRING_ELT(ret, i, NA_STRING);
         continue;
      }

      R_len_t str_cur_n     = str_cont.get(i).length();
      const char* str_cur_s = str_cont.get(i).c_str();

      status = U_ZERO_ERROR;
      int buf_need;
      if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
      else buf_need = ucasemap_utf8ToUpper(ucasemap,
         buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

      if (U_FAILURE(status)) { /* retry */
         buf.resize(buf_need, false/*destroy contents*/);
         status = U_ZERO_ERROR;
         if (_type == 1) buf_need = ucasemap_utf8ToLower(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);
         else buf_need = ucasemap_utf8ToUpper(ucasemap,
            buf.data(), buf.size(), (const char*)str_cur_s, str_cur_n, &status);

         STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */}) // this shouldn't happen
                                             // we do have the buffer size required to complete this op
      }

      SET_STRING_ELT(ret, i, Rf_mkCharLenCE(buf.data(), buf_need, CE_UTF8));
   }

   if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL;}
   STRI__UNPROTECT_ALL
   return ret;

   STRI__ERROR_HANDLER_END({
      if (ucasemap) { ucasemap_close(ucasemap); ucasemap = NULL; }
   })
}