Beispiel #1
0
void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
{
    int32_t itemCount = uset_getItemCount(set), len = 0;
    int32_t index = m_rand() % itemCount;
    UChar32 rangeStart = 0, rangeEnd = 0;
    UChar buffer[16];
    UErrorCode err = U_ZERO_ERROR;

    len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, 16, &err);

    if (len == 0) {
        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
        UChar32 ch = rangeStart + offset;
        UnicodeString str(ch);

        testCase.append(str);
        generateAlternative(str, alternate);
    } else if (len > 0) {
        // should check that len < 16...
        UnicodeString str(buffer, len);

        testCase.append(str);
        generateAlternative(str, alternate);
    } else {
        // shouldn't happen...
    }
}
Beispiel #2
0
/**
 * Get a set containing the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator
 * @param coll collator
 * @param conts the set to hold the result
 * @param status to hold the error code
 * @return the size of the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
                  USet *contractions,
                  UErrorCode *status)
{
  ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
  return uset_getItemCount(contractions);
}
Beispiel #3
0
CFCharacterSetRef _CFCreateCharacterSetFromUSet(USet *set) {
    UErrorCode icuErr = U_ZERO_ERROR;
    CFMutableCharacterSetRef working = CFCharacterSetCreateMutable(NULL);
    UChar   buffer[2048];   // Suitable for most small sets
    int32_t stringLen;

    if (working == NULL)
        return NULL;

    int32_t itemCount = uset_getItemCount(set);
    int32_t i;
    for (i = 0; i < itemCount; ++i)
    {
        UChar32   start, end;
        UChar * string;

        string = buffer;
        stringLen = uset_getItem(set, i, &start, &end, buffer, sizeof(buffer)/sizeof(UChar), &icuErr);
        if (icuErr == U_BUFFER_OVERFLOW_ERROR)
        {
            string = (UChar *) malloc(sizeof(UChar)*(stringLen+1));
            if (!string)
            {
                CFRelease(working);
                return NULL;
            }
            icuErr = U_ZERO_ERROR;
            (void) uset_getItem(set, i, &start, &end, string, stringLen+1, &icuErr);
        }
        if (U_FAILURE(icuErr))
        {
            if (string != buffer)
                free(string);
            CFRelease(working);
            return NULL;
        }
        if (stringLen <= 0)
            CFCharacterSetAddCharactersInRange(working, CFRangeMake(start, end-start+1));
        else
        {
            CFStringRef cfString = CFStringCreateWithCharactersNoCopy(kCFAllocatorSystemDefault, (UniChar *)string, stringLen, kCFAllocatorNull);
            CFCharacterSetAddCharactersInString(working, cfString);
            CFRelease(cfString);
        }
        if (string != buffer)
            free(string);
    }
    
    CFCharacterSetRef   result = CFCharacterSetCreateCopy(kCFAllocatorSystemDefault, working);
    CFRelease(working);
    return result;
}
Beispiel #4
0
U_CDECL_END



/**
 * Get a set containing the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator
 * @param coll collator
 * @param conts the set to hold the result
 * @param status to hold the error code
 * @return the size of the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions( const UCollator *coll,
                  USet *contractions,
                  UErrorCode *status)
{
  ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);
  return uset_getItemCount(contractions);
}
Beispiel #5
0
// Collator.contractions {{{
static PyObject *
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    UErrorCode status = U_ZERO_ERROR;
    UChar *str = NULL;
    UChar32 start=0, end=0;
    int32_t count = 0, len = 0, i;
    PyObject *ans = Py_None, *pbuf;

    if (self->contractions == NULL) {
        self->contractions = uset_open(1, 0);
        if (self->contractions == NULL) return PyErr_NoMemory();
        self->contractions = ucol_getTailoredSet(self->collator, &status);
    }
    status = U_ZERO_ERROR; 
    count = uset_getItemCount(self->contractions);

    str = (UChar*)calloc(100, sizeof(UChar));
    if (str == NULL) { PyErr_NoMemory(); goto end; }
    ans = PyTuple_New(count);
    if (ans == NULL) { goto end; }

    for (i = 0; i < count; i++) {
        len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
        if (len >= 2) {
            // We have a string
            status = U_ZERO_ERROR;
            pbuf = icu_to_python(str, len);
            if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; }
            PyTuple_SetItem(ans, i, pbuf);
        } else {
            // Ranges dont make sense for contractions, ignore them
            PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None);
        }
    }
end:
    if (str != NULL) free(str);
  
    return ans;
} // }}}
Beispiel #6
0
static void generateSelectorData(UConverterSelector* result,
                                 UPropsVectors *upvec,
                                 const USet* excludedCodePoints,
                                 const UConverterUnicodeSet whichSet,
                                 UErrorCode* status) {
  if (U_FAILURE(*status)) {
    return;
  }

  int32_t columns = (result->encodingsCount+31)/32;

  // set errorValue to all-ones
  for (int32_t col = 0; col < columns; col++) {
    upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP,
                   col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status);
  }

  for (int32_t i = 0; i < result->encodingsCount; ++i) {
    uint32_t mask;
    uint32_t column;
    int32_t item_count;
    int32_t j;
    UConverter* test_converter = ucnv_open(result->encodings[i], status);
    if (U_FAILURE(*status)) {
      return;
    }
    USet* unicode_point_set;
    unicode_point_set = uset_open(1, 0);  // empty set

    ucnv_getUnicodeSet(test_converter, unicode_point_set,
                       whichSet, status);
    if (U_FAILURE(*status)) {
      ucnv_close(test_converter);
      return;
    }

    column = i / 32;
    mask = 1 << (i%32);
    // now iterate over intervals on set i!
    item_count = uset_getItemCount(unicode_point_set);

    for (j = 0; j < item_count; ++j) {
      UChar32 start_char;
      UChar32 end_char;
      UErrorCode smallStatus = U_ZERO_ERROR;
      uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0,
                   &smallStatus);
      if (U_FAILURE(smallStatus)) {
        // this will be reached for the converters that fill the set with
        // strings. Those should be ignored by our system
      } else {
        upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask,
                       status);
      }
    }
    ucnv_close(test_converter);
    uset_close(unicode_point_set);
    if (U_FAILURE(*status)) {
      return;
    }
  }

  // handle excluded encodings! Simply set their values to all 1's in the upvec
  if (excludedCodePoints) {
    int32_t item_count = uset_getItemCount(excludedCodePoints);
    for (int32_t j = 0; j < item_count; ++j) {
      UChar32 start_char;
      UChar32 end_char;

      uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0,
                   status);
      for (int32_t col = 0; col < columns; col++) {
        upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0),
                      status);
      }
    }
  }

  // alright. Now, let's put things in the same exact form you'd get when you
  // unserialize things.
  result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status);
  result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status);
  result->pvCount *= columns;  // number of uint32_t = rows * columns
  result->ownPv = TRUE;
}
Beispiel #7
0
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characers we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        if (len == 0) {
            for (UChar32 ch = start; ch <= end; ch += 1) {
                UnicodeString *st = new UnicodeString(ch);

                if (st == NULL) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    break;
                }

                CEList *ceList = new CEList(coll, *st, status);

                ceToCharsStartingWith->put(ceList->get(0), st, status);

                delete ceList;
                delete st;
            }
        } else if (len > 0) {
            UnicodeString *st = new UnicodeString(buffer, len);

            if (st == NULL) {
                status = U_MEMORY_ALLOCATION_ERROR;
                break;
            }

            CEList *ceList = new CEList(coll, *st, status);

            ceToCharsStartingWith->put(ceList->get(0), st, status);

            delete ceList;
            delete st;
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
     UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
     CEList hanList(coll, hanString, status);
     CEList jamoList(coll, jamoString, status);
     int32_t j = 0;

     if (U_FAILURE(status)) {
         return;
     }

     for (int32_t c = 0; c < jamoList.size(); c += 1) {
         uint32_t jce = jamoList[c];

         if (! isContinuation(jce)) {
             jamoLimits[j++] = jce;
         }
     }

     jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

     minHan = 0xFFFFFFFF;
     maxHan = 0;

     for(int32_t h = 0; h < hanList.size(); h += 2) {
         uint32_t han = (uint32_t) hanList[h];

         if (han < minHan) {
             minHan = han;
         }

         if (han > maxHan) {
             maxHan = han;
         }
     }

     maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
Beispiel #8
0
static void expectItems(const USet* set,
                        const char* items) {
    const char* p = items;
    UChar ustr[4096], itemStr[4096];
    char buf[4096];
    char *pat;
    UErrorCode ec;
    int32_t expectedSize = 0;
    int32_t itemCount = uset_getItemCount(set);
    int32_t itemIndex = 0;
    UChar32 start = 1, end = 0;
    int32_t itemLen = 0, length;

    ec = U_ZERO_ERROR;
    length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec);
    if (U_FAILURE(ec)) {
        log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec));
        return;
    }
    pat=aescstrdup(ustr, length);

    if (uset_isEmpty(set) != (strlen(items)==0)) {
        log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n",
                pat,
                strlen(items)==0 ? "TRUE" : "FALSE");
    }

    /* Don't test patterns starting with "[^" */
    if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) {
        return;
    }

    while (*p) {

        ++expectedSize;

        if (start > end || start == -1) {
            /* Fetch our next item */
            if (itemIndex >= itemCount) {
                log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat);
                return;
            }

            itemLen = uset_getItem(set, itemIndex, &start, &end,
                                   itemStr, sizeof(itemStr), &ec);
            if (U_FAILURE(ec) || itemLen < 0) {
                log_err("FAIL: uset_getItem => %s\n", u_errorName(ec));
                return;
            }

            if (itemLen == 0) {
                log_verbose("Ok: %s item %d is %c-%c\n", pat,
                            itemIndex, oneUCharToChar(start),
                            oneUCharToChar(end));
            } else {
                itemStr[itemLen] = 0;
                u_UCharsToChars(itemStr, buf, itemLen+1);
                log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf);
            }

            ++itemIndex;
        }

        if (*p=='{') {
            const char* stringStart = ++p;
            int32_t stringLength = 0;
            char strCopy[64];

            while (*p++ != '}') {
            }
            stringLength = (int32_t)(p - stringStart - 1);
            strncpy(strCopy, stringStart, stringLength);
            strCopy[stringLength] = 0;

            u_charsToUChars(stringStart, ustr, stringLength);
            ustr[stringLength] = 0;
            
            if (itemLen == 0) {
                log_err("FAIL: for %s expect \"%s\" next, but got a char\n",
                        pat, strCopy);
                return;
            }

            if (u_strcmp(ustr, itemStr) != 0) {
                log_err("FAIL: for %s expect \"%s\" next\n",
                        pat, strCopy);
                return;
            }
        }

        else {
            UChar32 c;

            u_charsToUChars(p, ustr, 1);
            c = ustr[0];

            if (itemLen != 0) {
                log_err("FAIL: for %s expect '%c' next, but got a string\n",
                        pat, *p);
                return;
            }

            if (c != start++) {
                log_err("FAIL: for %s expect '%c' next\n",
                        pat, *p);
                return;
            }

            ++p;
        }
    }

    if (uset_size(set) == expectedSize) {
        log_verbose("Ok: %s size is %d\n", pat, expectedSize);
    } else {
        log_err("FAIL: %s size is %d, expected %d\n",
                pat, uset_size(set), expectedSize);
    }
}
Beispiel #9
-1
// Collator.contractions {{{
static PyObject *
icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) {
    UErrorCode status = U_ZERO_ERROR;
    UChar *str;
    UChar32 start=0, end=0;
    int32_t count = 0, len = 0, dlen = 0, i;
    PyObject *ans = Py_None, *pbuf;
    wchar_t *buf;

    if (self->contractions == NULL) {
        self->contractions = uset_open(1, 0);
        if (self->contractions == NULL) return PyErr_NoMemory();
        self->contractions = ucol_getTailoredSet(self->collator, &status);
    }
    status = U_ZERO_ERROR; 

    str = (UChar*)calloc(100, sizeof(UChar));
    buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t));
    if (str == NULL || buf == NULL) return PyErr_NoMemory();

    count = uset_getItemCount(self->contractions);
    ans = PyTuple_New(count);
    if (ans != NULL) {
        for (i = 0; i < count; i++) {
            len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status);
            if (len >= 2) {
                // We have a string
                status = U_ZERO_ERROR;
                u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status);
                pbuf = PyUnicode_FromWideChar(buf, dlen);
                if (pbuf == NULL) return PyErr_NoMemory();
                PyTuple_SetItem(ans, i, pbuf);
            } else {
                // Ranges dont make sense for contractions, ignore them
                PyTuple_SetItem(ans, i, Py_None);
            }
        }
    }
    free(str); free(buf);
  
    return Py_BuildValue("O", ans);
} // }}}