/**
 * Pick one random item (a code point range or a string) from `set`, append a
 * piece of text drawn from it to `testCase`, and append the text produced by
 * generateAlternative() for that piece to `alternate`.
 *
 * Nothing is appended when the set has no items or when the chosen item
 * cannot be fetched (for example a string that does not fit the local buffer).
 *
 * @param testCase  receives the chosen text
 * @param alternate receives the alternative generated for the chosen text
 */
void StringSetMonkey::append(UnicodeString &testCase, UnicodeString &alternate)
{
    const int32_t kBufferCapacity = 16;
    int32_t itemCount = uset_getItemCount(set);

    // An empty set would make the modulo below divide by zero.
    if (itemCount <= 0) {
        return;
    }

    int32_t index = m_rand() % itemCount;
    UChar32 rangeStart = 0, rangeEnd = 0;
    UChar buffer[kBufferCapacity];
    UErrorCode err = U_ZERO_ERROR;
    int32_t len = uset_getItem(set, index, &rangeStart, &rangeEnd, buffer, kBufferCapacity, &err);

    // On failure (e.g. U_BUFFER_OVERFLOW_ERROR) `len` may exceed the buffer
    // size, so the buffer contents must not be used.
    if (U_FAILURE(err) || len < 0) {
        return;
    }

    if (len == 0) {
        // The item is a range: choose one code point inside it.
        int32_t offset = m_rand() % (rangeEnd - rangeStart + 1);
        UChar32 ch = rangeStart + offset;
        UnicodeString str(ch);

        testCase.append(str);
        generateAlternative(str, alternate);
    } else {
        // The item is a string of `len` code units held in `buffer`.
        UnicodeString str(buffer, len);

        testCase.append(str);
        generateAlternative(str, alternate);
    }
}
/**
 * Fill a set with the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator.
 * Expansions are not collected.
 * @param coll collator
 * @param contractions the set to hold the result
 * @param status to hold the error code
 * @return the number of items in the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions(const UCollator *coll,
                     USet *contractions,
                     UErrorCode *status)
{
    /* No expansion set is requested, and prefixes are not added. */
    ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);

    const int32_t contractionCount = uset_getItemCount(contractions);
    return contractionCount;
}
/*
 * Build an immutable CFCharacterSet from the items of an ICU USet.
 *
 * Range items are added as character ranges; for string items, the
 * characters of the string are added to the set.
 *
 * Returns a new character set owned by the caller, or NULL if an allocation
 * fails or ICU reports an error while reading an item.
 */
CFCharacterSetRef _CFCreateCharacterSetFromUSet(USet *set) {
    UErrorCode icuErr = U_ZERO_ERROR;
    CFMutableCharacterSetRef working = CFCharacterSetCreateMutable(NULL);
    UChar buffer[2048];   // Suitable for most small sets
    int32_t stringLen;

    if (working == NULL)
        return NULL;

    int32_t itemCount = uset_getItemCount(set);
    int32_t i;
    for (i = 0; i < itemCount; ++i) {
        UChar32 start, end;
        UChar *string = buffer;

        stringLen = uset_getItem(set, i, &start, &end, buffer, sizeof(buffer)/sizeof(UChar), &icuErr);
        if (icuErr == U_BUFFER_OVERFLOW_ERROR) {
            // The stack buffer was too small; retry with a heap buffer of the reported size.
            string = (UChar *) malloc(sizeof(UChar)*(stringLen+1));
            if (!string) {
                CFRelease(working);
                return NULL;
            }
            icuErr = U_ZERO_ERROR;
            (void) uset_getItem(set, i, &start, &end, string, stringLen+1, &icuErr);
        }
        if (U_FAILURE(icuErr)) {
            if (string != buffer)
                free(string);
            CFRelease(working);
            return NULL;
        }
        if (stringLen <= 0) {
            CFCharacterSetAddCharactersInRange(working, CFRangeMake(start, end-start+1));
        } else {
            // kCFAllocatorNull: the CFString only borrows `string`, which must outlive it.
            CFStringRef cfString = CFStringCreateWithCharactersNoCopy(kCFAllocatorSystemDefault, (UniChar *)string, stringLen, kCFAllocatorNull);
            if (cfString == NULL) {
                // Passing NULL to CFCharacterSetAddCharactersInString/CFRelease is not allowed.
                if (string != buffer)
                    free(string);
                CFRelease(working);
                return NULL;
            }
            CFCharacterSetAddCharactersInString(working, cfString);
            CFRelease(cfString);
        }
        if (string != buffer)
            free(string);
    }

    CFCharacterSetRef result = CFCharacterSetCreateCopy(kCFAllocatorSystemDefault, working);
    CFRelease(working);
    return result;
}
U_CDECL_END

/**
 * Fill a set with the contractions defined by the collator. The set includes
 * both the UCA contractions and the contractions defined by the collator.
 * Expansions are not collected.
 * @param coll collator
 * @param contractions the set to hold the result
 * @param status to hold the error code
 * @return the number of items in the contraction set
 */
U_CAPI int32_t U_EXPORT2
ucol_getContractions(const UCollator *coll,
                     USet *contractions,
                     UErrorCode *status)
{
    /* No expansion set is requested, and prefixes are not added. */
    ucol_getContractionsAndExpansions(coll, contractions, NULL, FALSE, status);

    const int32_t contractionCount = uset_getItemCount(contractions);
    return contractionCount;
}
// Collator.contractions {{{ static PyObject * icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) { UErrorCode status = U_ZERO_ERROR; UChar *str = NULL; UChar32 start=0, end=0; int32_t count = 0, len = 0, i; PyObject *ans = Py_None, *pbuf; if (self->contractions == NULL) { self->contractions = uset_open(1, 0); if (self->contractions == NULL) return PyErr_NoMemory(); self->contractions = ucol_getTailoredSet(self->collator, &status); } status = U_ZERO_ERROR; count = uset_getItemCount(self->contractions); str = (UChar*)calloc(100, sizeof(UChar)); if (str == NULL) { PyErr_NoMemory(); goto end; } ans = PyTuple_New(count); if (ans == NULL) { goto end; } for (i = 0; i < count; i++) { len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status); if (len >= 2) { // We have a string status = U_ZERO_ERROR; pbuf = icu_to_python(str, len); if (pbuf == NULL) { Py_DECREF(ans); ans = NULL; goto end; } PyTuple_SetItem(ans, i, pbuf); } else { // Ranges dont make sense for contractions, ignore them PyTuple_SetItem(ans, i, Py_None); Py_INCREF(Py_None); } } end: if (str != NULL) free(str); return ans; } // }}}
static void generateSelectorData(UConverterSelector* result, UPropsVectors *upvec, const USet* excludedCodePoints, const UConverterUnicodeSet whichSet, UErrorCode* status) { if (U_FAILURE(*status)) { return; } int32_t columns = (result->encodingsCount+31)/32; // set errorValue to all-ones for (int32_t col = 0; col < columns; col++) { upvec_setValue(upvec, UPVEC_ERROR_VALUE_CP, UPVEC_ERROR_VALUE_CP, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status); } for (int32_t i = 0; i < result->encodingsCount; ++i) { uint32_t mask; uint32_t column; int32_t item_count; int32_t j; UConverter* test_converter = ucnv_open(result->encodings[i], status); if (U_FAILURE(*status)) { return; } USet* unicode_point_set; unicode_point_set = uset_open(1, 0); // empty set ucnv_getUnicodeSet(test_converter, unicode_point_set, whichSet, status); if (U_FAILURE(*status)) { ucnv_close(test_converter); return; } column = i / 32; mask = 1 << (i%32); // now iterate over intervals on set i! item_count = uset_getItemCount(unicode_point_set); for (j = 0; j < item_count; ++j) { UChar32 start_char; UChar32 end_char; UErrorCode smallStatus = U_ZERO_ERROR; uset_getItem(unicode_point_set, j, &start_char, &end_char, NULL, 0, &smallStatus); if (U_FAILURE(smallStatus)) { // this will be reached for the converters that fill the set with // strings. Those should be ignored by our system } else { upvec_setValue(upvec, start_char, end_char, column, static_cast<uint32_t>(~0), mask, status); } } ucnv_close(test_converter); uset_close(unicode_point_set); if (U_FAILURE(*status)) { return; } } // handle excluded encodings! 
Simply set their values to all 1's in the upvec if (excludedCodePoints) { int32_t item_count = uset_getItemCount(excludedCodePoints); for (int32_t j = 0; j < item_count; ++j) { UChar32 start_char; UChar32 end_char; uset_getItem(excludedCodePoints, j, &start_char, &end_char, NULL, 0, status); for (int32_t col = 0; col < columns; col++) { upvec_setValue(upvec, start_char, end_char, col, static_cast<uint32_t>(~0), static_cast<uint32_t>(~0), status); } } } // alright. Now, let's put things in the same exact form you'd get when you // unserialize things. result->trie = upvec_compactToUTrie2WithRowIndexes(upvec, status); result->pv = upvec_cloneArray(upvec, &result->pvCount, NULL, status); result->pvCount *= columns; // number of uint32_t = rows * columns result->ownPv = TRUE; }
/**
 * Build the collation data used for searching.
 *
 * The constructor
 *  - collects the characters to index: [[:assigned:]-[:c:]] plus the
 *    collator's contractions and expansions, minus the ranges that are
 *    handled implicitly (Han, Jamo, Hangul);
 *  - records every such character or string in ceToCharsStartingWith under
 *    the first CE of its CE list;
 *  - derives jamoLimits and minHan/maxHan from the CEs of the Jamo range
 *    boundaries and of the [:Unified_Ideograph:] range boundaries.
 *
 * @param collator the collator to analyze (cloned only if CLONE_COLLATOR is defined)
 * @param status   ICU error code; set on failure, in which case the object
 *                 is left partially initialized
 */
CollData::CollData(UCollator *collator, UErrorCode &status)
    : coll(NULL), ceToCharsStartingWith(NULL)
{
    // [:c:] == [[:cn:][:cc:][:co:][:cf:][:cs:]]
    // i.e. other, control, private use, format, surrogate
    U_STRING_DECL(test_pattern, "[[:assigned:]-[:c:]]", 20);
    U_STRING_INIT(test_pattern, "[[:assigned:]-[:c:]]", 20);
    USet *charsToTest = uset_openPattern(test_pattern, 20, &status);

    // Han ext. A, Han, Jamo, Hangul, Han Ext. B
    // i.e. all the characers we handle implicitly
    U_STRING_DECL(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    U_STRING_INIT(remove_pattern, "[[\\u3400-\\u9FFF][\\u1100-\\u11F9][\\uAC00-\\uD7AF][\\U00020000-\\U0002A6DF]]", 70);
    USet *charsToRemove = uset_openPattern(remove_pattern, 70, &status);

    if (U_FAILURE(status)) {
        // One of the two sets may have been opened successfully; do not leak it.
        if (charsToRemove != NULL) {
            uset_close(charsToRemove);
        }
        if (charsToTest != NULL) {
            uset_close(charsToTest);
        }
        return;
    }

    USet *expansions   = uset_openEmpty();
    USet *contractions = uset_openEmpty();
    int32_t itemCount;

    ceToCharsStartingWith = new CEToStringsMap(status);

    if (ceToCharsStartingWith == NULL && U_SUCCESS(status)) {
        status = U_MEMORY_ALLOCATION_ERROR;
    }

    if (U_FAILURE(status)) {
        goto bail;
    }

#ifdef CLONE_COLLATOR
    coll = ucol_safeClone(collator, NULL, NULL, &status);

    if (U_FAILURE(status)) {
        goto bail;
    }
#else
    coll = collator;
#endif

    ucol_getContractionsAndExpansions(coll, contractions, expansions, FALSE, &status);

    uset_addAll(charsToTest, contractions);
    uset_addAll(charsToTest, expansions);
    uset_removeAll(charsToTest, charsToRemove);

    itemCount = uset_getItemCount(charsToTest);
    for(int32_t item = 0; item < itemCount; item += 1) {
        UChar32 start = 0, end = 0;
        UChar buffer[16];
        int32_t len = uset_getItem(charsToTest, item, &start, &end,
                                   buffer, 16, &status);

        // On failure (e.g. a string longer than the buffer) `len` and the
        // buffer contents must not be used.
        if (U_FAILURE(status)) {
            break;
        }

        if (len == 0) {
            // A range: index every code point in it by the first CE of its CE list.
            for (UChar32 ch = start; ch <= end && U_SUCCESS(status); ch += 1) {
                UnicodeString st(ch);
                CEList ceList(coll, st, status);

                ceToCharsStartingWith->put(ceList.get(0), &st, status);
            }
        } else if (len > 0) {
            // A string: index it by the first CE of its CE list.
            UnicodeString st(buffer, len);
            CEList ceList(coll, st, status);

            ceToCharsStartingWith->put(ceList.get(0), &st, status);
        } else {
            // shouldn't happen...
        }

        if (U_FAILURE(status)) {
             break;
        }
    }

bail:
    uset_close(contractions);
    uset_close(expansions);
    uset_close(charsToRemove);
    uset_close(charsToTest);

    if (U_FAILURE(status)) {
        return;
    }

    UnicodeSet hanRanges(UNICODE_STRING_SIMPLE("[:Unified_Ideograph:]"), status);
    if (U_FAILURE(status)) {
        return;
    }
    UnicodeSetIterator hanIter(hanRanges);
    UnicodeString hanString;
    // Collect the first and last code point of every Unified_Ideograph range.
    while(hanIter.nextRange()) {
        hanString.append(hanIter.getCodepoint());
        hanString.append(hanIter.getCodepointEnd());
    }
    // TODO: Why U+11FF? The old code had an outdated UCOL_LAST_T_JAMO=0x11F9,
    // but as of Unicode 6.3 the 11xx block is filled,
    // and there are also more Jamo T at U+D7CB..U+D7FB.
    // Maybe use [:HST=T:] and look for the end of the last range?
    // Maybe use script boundary mappings instead of this code??
    UChar  jamoRanges[] = {Hangul::JAMO_L_BASE, Hangul::JAMO_V_BASE, Hangul::JAMO_T_BASE + 1, 0x11FF};
    UnicodeString jamoString(FALSE, jamoRanges, UPRV_LENGTHOF(jamoRanges));
    CEList hanList(coll, hanString, status);
    CEList jamoList(coll, jamoString, status);
    int32_t j = 0;

    if (U_FAILURE(status)) {
        return;
    }

    // Keep only the non-continuation CEs of the Jamo boundary characters.
    for (int32_t c = 0; c < jamoList.size(); c += 1) {
        uint32_t jce = jamoList[c];

        if (! isContinuation(jce)) {
            jamoLimits[j++] = jce;
        }
    }

    jamoLimits[3] += (1 << UCOL_PRIMARYORDERSHIFT);

    minHan = 0xFFFFFFFF;
    maxHan = 0;

    // Only every other CE of hanList is examined, as in the boundary pairs appended above.
    for(int32_t h = 0; h < hanList.size(); h += 2) {
        uint32_t han = (uint32_t) hanList[h];

        if (han < minHan) {
            minHan = han;
        }

        if (han > maxHan) {
            maxHan = han;
        }
    }

    maxHan += (1 << UCOL_PRIMARYORDERSHIFT);
}
static void expectItems(const USet* set, const char* items) { const char* p = items; UChar ustr[4096], itemStr[4096]; char buf[4096]; char *pat; UErrorCode ec; int32_t expectedSize = 0; int32_t itemCount = uset_getItemCount(set); int32_t itemIndex = 0; UChar32 start = 1, end = 0; int32_t itemLen = 0, length; ec = U_ZERO_ERROR; length = uset_toPattern(set, ustr, sizeof(ustr), TRUE, &ec); if (U_FAILURE(ec)) { log_err("FAIL: uset_toPattern => %s\n", u_errorName(ec)); return; } pat=aescstrdup(ustr, length); if (uset_isEmpty(set) != (strlen(items)==0)) { log_data_err("FAIL: %s should return %s from isEmpty (Are you missing data?)\n", pat, strlen(items)==0 ? "TRUE" : "FALSE"); } /* Don't test patterns starting with "[^" */ if (u_strlen(ustr) > 2 && ustr[1] == 0x5e /*'^'*/) { return; } while (*p) { ++expectedSize; if (start > end || start == -1) { /* Fetch our next item */ if (itemIndex >= itemCount) { log_data_err("FAIL: ran out of items iterating %s (Are you missing data?)\n", pat); return; } itemLen = uset_getItem(set, itemIndex, &start, &end, itemStr, sizeof(itemStr), &ec); if (U_FAILURE(ec) || itemLen < 0) { log_err("FAIL: uset_getItem => %s\n", u_errorName(ec)); return; } if (itemLen == 0) { log_verbose("Ok: %s item %d is %c-%c\n", pat, itemIndex, oneUCharToChar(start), oneUCharToChar(end)); } else { itemStr[itemLen] = 0; u_UCharsToChars(itemStr, buf, itemLen+1); log_verbose("Ok: %s item %d is \"%s\"\n", pat, itemIndex, buf); } ++itemIndex; } if (*p=='{') { const char* stringStart = ++p; int32_t stringLength = 0; char strCopy[64]; while (*p++ != '}') { } stringLength = (int32_t)(p - stringStart - 1); strncpy(strCopy, stringStart, stringLength); strCopy[stringLength] = 0; u_charsToUChars(stringStart, ustr, stringLength); ustr[stringLength] = 0; if (itemLen == 0) { log_err("FAIL: for %s expect \"%s\" next, but got a char\n", pat, strCopy); return; } if (u_strcmp(ustr, itemStr) != 0) { log_err("FAIL: for %s expect \"%s\" next\n", pat, strCopy); return; } } else { 
UChar32 c; u_charsToUChars(p, ustr, 1); c = ustr[0]; if (itemLen != 0) { log_err("FAIL: for %s expect '%c' next, but got a string\n", pat, *p); return; } if (c != start++) { log_err("FAIL: for %s expect '%c' next\n", pat, *p); return; } ++p; } } if (uset_size(set) == expectedSize) { log_verbose("Ok: %s size is %d\n", pat, expectedSize); } else { log_err("FAIL: %s size is %d, expected %d\n", pat, uset_size(set), expectedSize); } }
// Collator.contractions {{{ static PyObject * icu_Collator_contractions(icu_Collator *self, PyObject *args, PyObject *kwargs) { UErrorCode status = U_ZERO_ERROR; UChar *str; UChar32 start=0, end=0; int32_t count = 0, len = 0, dlen = 0, i; PyObject *ans = Py_None, *pbuf; wchar_t *buf; if (self->contractions == NULL) { self->contractions = uset_open(1, 0); if (self->contractions == NULL) return PyErr_NoMemory(); self->contractions = ucol_getTailoredSet(self->collator, &status); } status = U_ZERO_ERROR; str = (UChar*)calloc(100, sizeof(UChar)); buf = (wchar_t*)calloc(4*100+2, sizeof(wchar_t)); if (str == NULL || buf == NULL) return PyErr_NoMemory(); count = uset_getItemCount(self->contractions); ans = PyTuple_New(count); if (ans != NULL) { for (i = 0; i < count; i++) { len = uset_getItem(self->contractions, i, &start, &end, str, 1000, &status); if (len >= 2) { // We have a string status = U_ZERO_ERROR; u_strToWCS(buf, 4*100 + 1, &dlen, str, len, &status); pbuf = PyUnicode_FromWideChar(buf, dlen); if (pbuf == NULL) return PyErr_NoMemory(); PyTuple_SetItem(ans, i, pbuf); } else { // Ranges dont make sense for contractions, ignore them PyTuple_SetItem(ans, i, Py_None); } } } free(str); free(buf); return Py_BuildValue("O", ans); } // }}}