/*
 * Writes a printable representation of a sort key into `buffer`.
 *
 * The output starts with "[" and ends with "]". The bytes of each collation
 * level are appended through uprv_appendByteToHexString(), each followed by a
 * single space; consecutive levels are separated by " . ". The byte that
 * terminates a level (expected to be 0x01) is appended as well, without a
 * trailing space.
 *
 * coll     collator that produced the key; its strength, case-level and
 *          alternate-handling attributes decide how many levels are walked
 * sortkey  zero-terminated sort key bytes
 * buffer   destination for the text; this function performs no bounds
 *          checking while appending
 * len      dereferenced once for the final size comparison
 *
 * Returns `buffer`.
 *
 * NOTE(review): `res_size` is never updated after its initialisation to 0, so
 * the `res_size > *len` comparison cannot be true and the NULL return is
 * effectively unreachable. Callers must size `buffer` generously.
 */
static char* U_EXPORT2
sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
    int32_t level = UCOL_PRIMARY;
    uint32_t res_size = 0;
    UBool caseLevelDone = FALSE;
    UErrorCode errorCode = U_ZERO_ERROR;
    char *out = buffer;
    const uint8_t *sk = sortkey;

    uprv_strcpy(out, "[");

    while (level <= UCOL_QUATERNARY && level <= ucol_getStrength(coll)) {
        UColAttributeValue caseLevel;

        if (level > UCOL_PRIMARY) {
            uprv_strcat(out, " . ");
        }

        /* print a level: everything up to the next separator or the terminator */
        for (; *sk != 0x01 && *sk != 0x00; ++sk) {
            uprv_appendByteToHexString(out, *sk);
            uprv_strcat(out, " ");
        }

        /*
         * With the case level switched on, one extra level sits right after
         * the secondary one, so the counter is held at UCOL_SECONDARY for
         * exactly one additional pass.
         */
        caseLevel = ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode);
        if (caseLevel == UCOL_ON && level == UCOL_SECONDARY && !caseLevelDone) {
            caseLevelDone = TRUE;
        } else if (caseLevel == UCOL_OFF || caseLevelDone || level != UCOL_SECONDARY) {
            ++level;
        }

        if (*sk) {
            /* This should print '01' */
            uprv_appendByteToHexString(out, *sk++);
        }

        /* A non-ignorable collator is not walked for a quaternary level. */
        if (level == UCOL_QUATERNARY &&
            ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &errorCode) == UCOL_NON_IGNORABLE) {
            break;
        }
    }

    if (ucol_getStrength(coll) == UCOL_IDENTICAL) {
        uprv_strcat(out, " . ");
        for (; *sk != 0; ++sk) {
            uprv_appendByteToHexString(out, *sk);
            uprv_strcat(out, " ");
        }
        /* append the terminating zero byte too */
        uprv_appendByteToHexString(out, *sk);
    }

    uprv_strcat(out, "]");

    return (res_size > *len) ? NULL : buffer;
}
/*
 * Creates a collator derived from pCollator and adjusted for the requested
 * comparison options.
 *
 * Ownership: the returned collator belongs to the caller, which must close it
 * once this function has returned with a U_SUCCESS UErrorCode.
 *
 * On error, the return value is undefined.
 */
UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr)
{
    const bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
    const bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
    const bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;

    // Start from the source collator's strength; IgnoreNonSpace takes
    // precedence over IgnoreCase when both are requested.
    UColAttributeValue strength = ucol_getStrength(pCollator);
    if (isIgnoreNonSpace)
    {
        strength = UCOL_PRIMARY;
    }
    else if (isIgnoreCase)
    {
        strength = UCOL_SECONDARY;
    }

    UCollator* pClonedCollator;
    std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);

    if (!customRules.empty())
    {
        // Build "<locale rules><custom rules>\0" and open a fresh collator from it.
        int32_t localeRulesLength;
        const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);
        int32_t customRuleLength = customRules.size();

        std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0');
        int32_t writeAt = 0;
        for (int32_t readAt = 0; readAt < localeRulesLength; ++readAt)
        {
            completeRules[writeAt++] = localeRules[readAt];
        }
        for (int32_t readAt = 0; readAt < customRuleLength; ++readAt)
        {
            completeRules[writeAt++] = customRules[readAt];
        }

        pClonedCollator = ucol_openRules(
            completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, nullptr, pErr);
    }
    else
    {
        // No extra tailoring required: a plain clone is enough.
        pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
    }

    if (isIgnoreSymbols)
    {
        ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr);
    }

    ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr);

    // Casing differs at the tertiary level. When the strength is below
    // tertiary but case must still be honoured, CASE_LEVEL has to be switched on.
    const bool needsCaseLevel = (strength < UCOL_TERTIARY) && !isIgnoreCase;
    if (needsCaseLevel)
    {
        ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr);
    }

    return pClonedCollator;
}
// Returns the strength reported by ucol_getStrength() for the wrapped
// collator. If the object holds no collator, a warning is raised and 0 is
// returned instead.
int64_t c_Collator::t_getstrength() {
  if (m_ucoll) {
    return ucol_getStrength(m_ucoll);
  }
  raise_warning("getstrength called on uninitialized Collator object");
  return 0;
}
// Returns the strength reported by ucol_getStrength() for the wrapped
// collator. If the object holds no collator, a warning is raised and 0 is
// returned instead.
int64 c_Collator::t_getstrength() {
  INSTANCE_METHOD_INJECTION_BUILTIN(Collator, Collator::getstrength);
  if (m_ucoll) {
    return ucol_getStrength(m_ucoll);
  }
  raise_warning("getstrength called on uninitialized Collator object");
  return 0;
}
/**
 * Builds the list of collation elements for `string` under `coll`.
 *
 * Each element delivered by the collation element iterator is masked down to
 * the levels that matter for the collator's strength. When alternate handling
 * is "shifted", elements with a non-zero primary weight that sort below the
 * variable top are either reduced to their primary bits (strength quaternary
 * or higher) or dropped as ignorable. Ignorable elements are skipped, and the
 * continuation marker is re-applied to elements that carried it before masking.
 *
 * @param coll    collator supplying strength, alternate handling and variable top
 * @param string  text whose collation elements are collected
 * @param status  ICU error code; on failure after the initial queries the
 *                list is left empty (`ces` stays NULL) and the iterator is
 *                released before returning
 */
CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
{
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    UCollationStrength strength = ucol_getStrength(coll);
    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    uint32_t variableTop = ucol_getVariableTop(coll, &status);
    uint32_t strengthMask = 0;
    int32_t order;

    if (U_FAILURE(status)) {
        // The iterator may have been opened successfully before a later
        // query failed; release it so this early exit does not leak it.
        if (elems != NULL) {
            ucol_closeElements(elems);
        }
        return;
    }

    // **** only set flag if string has Han(gul) ****
    // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476

    // Accumulate the mask from the strongest applicable level downwards.
    switch (strength)
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        U_FALLTHROUGH;
    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        U_FALLTHROUGH;
    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }

    ces = ceBuffer;

    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
        // Remember the continuation flag before masking strips it.
        UBool cont = isContinuation(order);

        order &= strengthMask;

        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }

        if (order == UCOL_IGNORABLE) {
            continue;
        }

        if (cont) {
            order |= UCOL_CONTINUATION_MARKER;
        }

        add(order, status);
    }

    ucol_closeElements(elems);
}
void TextSearcherICU::setCaseSensitivity(bool caseSensitive) { const UCollationStrength strength = caseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY; UCollator* const collator = usearch_getCollator(m_searcher); if (ucol_getStrength(collator) == strength) return; ucol_setStrength(collator, strength); usearch_reset(m_searcher); }
// Prepares a search buffer for `target`: stores a folded copy of the target,
// sizes the internal buffer, resolves the AtWordStarts option against the
// target's first character, and configures the global ICU searcher (collation
// strength and pattern) for this search.
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options)
    , m_prefixLength(0)
    , m_numberOfCharactersJustAppended(0)
    , m_atBreak(true)
    , m_needsMoreContext(options & AtWordStarts)
    , m_targetRequiresKanaWorkaround(containsKanaLetters(target))
{
    ASSERT(!target.isEmpty());
    target.appendTo(m_target);

    // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
    // of doing it in a separate replacement pass here, but ICU doesn't offer a way
    // to add tailoring on top of the locale-specific tailoring as of this writing.
    foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

    const size_t patternLength = m_target.size();
    m_buffer.reserveInitialCapacity(std::max(patternLength * 8, minimumSearchBufferSize));
    m_overlap = m_buffer.capacity() / 4;

    const bool wordStartsRequested = (m_options & AtWordStarts) && patternLength;
    if (wordStartsRequested) {
        UChar32 firstCharacter;
        U16_GET(m_target.data(), 0, 0, patternLength, firstCharacter);
        // Characters in the separator category never really occur at the beginning of a word,
        // so if the target begins with such a character, we just ignore the AtWordStart option.
        if (isSeparator(firstCharacter)) {
            m_options &= ~AtWordStarts;
            m_needsMoreContext = false;
        }
    }

    // Grab the single global searcher.
    // If we ever have a reason to do more than once search buffer at once, we'll have
    // to move to multiple searchers.
    lockSearcher();

    UStringSearch* const stringSearch = blink::searcher();
    UCollator* const collator = usearch_getCollator(stringSearch);

    // Case-insensitive searches compare at primary strength, all others at tertiary.
    const UCollationStrength wantedStrength = (m_options & CaseInsensitive) ? UCOL_PRIMARY : UCOL_TERTIARY;
    if (ucol_getStrength(collator) != wantedStrength) {
        ucol_setStrength(collator, wantedStrength);
        usearch_reset(stringSearch);
    }

    UErrorCode status = U_ZERO_ERROR;
    usearch_setPattern(stringSearch, m_target.data(), patternLength, &status);
    ASSERT(status == U_ZERO_ERROR);

    // The kana workaround requires a normalized copy of the target string.
    if (m_targetRequiresKanaWorkaround) {
        normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget);
    }
}
// Masks `order` so that only the differences relevant to the collator's
// strength remain: primary strength keeps the primary difference bits,
// secondary strength keeps the secondary difference bits, and any other
// strength returns `order` untouched.
int32_t CollationElementIterator::strengthOrder(int32_t order) const
{
    switch (ucol_getStrength(m_data_->iteratordata_.coll)) {
    case UCOL_PRIMARY:
        return order & RuleBasedCollator::PRIMARYDIFFERENCEONLY;
    case UCOL_SECONDARY:
        return order & RuleBasedCollator::SECONDARYDIFFERENCEONLY;
    default:
        return order;
    }
}
/*
 * Lua binding: combined getter/setter for a collator's strength.
 *
 * Argument 1 must be a userdata whose metatable is COLLATOR_UV_META;
 * otherwise an argument error ("expecting collator") is raised.
 *
 * Called with the collator only, pushes its current strength as a number.
 * Called with more arguments, argument 2 is read as a number and set as the
 * new strength; the stack is then trimmed to the collator, which becomes the
 * single return value.
 */
static int icu_collator_strength(lua_State *L) {
    UCollator *collator;

    luaL_argcheck(L, lua_getmetatable(L,1) && lua_rawequal(L,-1,COLLATOR_UV_META), 1, "expecting collator");
    lua_pop(L,1); /* drop the metatable pushed by the check above */

    collator = *(UCollator**)lua_touserdata(L,1);

    if (lua_gettop(L) != 1) {
        /* setter */
        ucol_setStrength(collator, (UCollationStrength)luaL_checknumber(L,2));
        lua_settop(L,1);
        return 1;
    }

    /* getter */
    lua_pushnumber(L, ucol_getStrength(collator));
    return 1;
}
/**
 * Sets up a search target for `theCollator`.
 *
 * Reads the collator's strength, alternate handling and variable top,
 * allocates the collation element buffer (sized from the pattern length and
 * the collator's largest expansion), optionally installs the target string,
 * and derives the strength mask.
 *
 * @param theCollator    collator used for the search
 * @param target         initial target string, or NULL for none
 * @param patternLength  pattern length used when sizing the CE buffer
 * @param status         ICU error code; set to U_MEMORY_ALLOCATION_ERROR if
 *                       the CE buffer cannot be allocated
 */
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      nfd(*Normalizer2Factory::getNFDInstance(status)),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength = ucol_getStrength(coll);
    toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) == UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // find the largest expansion (the size table ends at its first zero entry)
    uint8_t largestExpansion = 0;
    const uint8_t *sizeEntry = coll->expansionCESize;
    while (*sizeEntry != 0) {
        if (largestExpansion < *sizeEntry) {
            largestExpansion = *sizeEntry;
        }
        ++sizeEntry;
    }

    // room for an extra character on each end, plus 4 for safety
    bufferSize = patternLength + (2 * largestExpansion) + 4;

    ceb = NEW_ARRAY(CEI, bufferSize);
    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    // Primary bits always count; secondary bits for everything except primary
    // strength; tertiary bits for everything except primary and secondary.
    strengthMask |= UCOL_PRIMARYORDERMASK;
    if (strength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;
        if (strength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }
}
/**
 * Collects the collation orders of `string`, starting at `stringOffset`,
 * together with the iterator offsets before and after each element.
 *
 * Every real order is masked to the levels relevant for the collator's
 * strength; orders equal to UCOL_IGNORABLE after masking are not recorded.
 * The terminating UCOL_NULLORDER is left unmasked and goes through the same
 * "not ignorable" test, so it is recorded as the final entry whenever it
 * differs from UCOL_IGNORABLE.
 *
 * @param coll          collator providing the elements and the strength
 * @param string        text to iterate over
 * @param stringOffset  offset at which iteration starts
 */
OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
    : list(NULL), listMax(16), listSize(0)
{
    UErrorCode status = U_ZERO_ERROR;
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);

    // Primary bits always count; secondary bits for everything except primary
    // strength; tertiary bits for everything except primary and secondary.
    const UCollationStrength collatorStrength = ucol_getStrength(coll);
    uint32_t strengthMask = UCOL_PRIMARYORDERMASK;
    if (collatorStrength != UCOL_PRIMARY) {
        strengthMask |= UCOL_SECONDARYORDERMASK;
        if (collatorStrength != UCOL_SECONDARY) {
            strengthMask |= UCOL_TERTIARYORDERMASK;
        }
    }

    list = new Order[listMax];

    ucol_setOffset(elems, stringOffset, &status);

    int32_t order;
    do {
        const int32_t low = ucol_getOffset(elems);
        order = ucol_next(elems, &status);
        const int32_t high = ucol_getOffset(elems);

        if (order != UCOL_NULLORDER) {
            order &= strengthMask;
        }

        if (order != UCOL_IGNORABLE) {
            add(order, low, high);
        }
    } while (order != UCOL_NULLORDER);

    ucol_closeElements(elems);
}
// Collator.strength {{{ static PyObject * icu_Collator_get_strength(icu_Collator *self, void *closure) { return Py_BuildValue("i", ucol_getStrength(self->collator)); }
/*
 * Creates a collator derived from pCollator and adjusted for the requested
 * comparison options.
 *
 * Ownership: the returned collator belongs to the caller, which must close it
 * once this function has returned with a U_SUCCESS UErrorCode.
 *
 * On error, the return value is undefined.
 */
UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr)
{
    const bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
    const bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
    const bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;

    // Start from the source collator's strength; IgnoreNonSpace takes
    // precedence over IgnoreCase when both are requested.
    UColAttributeValue strength = ucol_getStrength(pCollator);
    if (isIgnoreNonSpace)
    {
        strength = UCOL_PRIMARY;
    }
    else if (isIgnoreCase)
    {
        strength = UCOL_SECONDARY;
    }

    UCollator* pClonedCollator;
    std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);

    if (!customRules.empty())
    {
        // Build "<locale rules><custom rules>\0" and open a fresh collator from it.
        int32_t localeRulesLength;
        const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);
        int32_t customRuleLength = customRules.size();

        std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0');
        int32_t writeAt = 0;
        for (int32_t readAt = 0; readAt < localeRulesLength; ++readAt)
        {
            completeRules[writeAt++] = localeRules[readAt];
        }
        for (int32_t readAt = 0; readAt < customRuleLength; ++readAt)
        {
            completeRules[writeAt++] = customRules[readAt];
        }

        pClonedCollator = ucol_openRules(
            completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, nullptr, pErr);
    }
    else
    {
        // No extra tailoring required: a plain clone is enough.
        pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
    }

    if (isIgnoreSymbols)
    {
        ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr);

        // by default, ICU alternate shifted handling only ignores punctuation, but
        // IgnoreSymbols needs symbols and currency as well, so change the "variable top"
        // to include all symbols and currency
#if HAVE_SET_MAX_VARIABLE
        ucol_setMaxVariable(pClonedCollator, UCOL_REORDER_CODE_CURRENCY, pErr);
#else
        // 0xfdfc is the last currency character before the first digit character
        // in http://source.icu-project.org/repos/icu/icu/tags/release-52-1/source/data/unidata/FractionalUCA.txt
        const UChar ignoreSymbolsVariableTop[] = { 0xfdfc };
        ucol_setVariableTop(pClonedCollator, ignoreSymbolsVariableTop, 1, pErr);
#endif
    }

    ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr);

    // Casing differs at the tertiary level. When the strength is below
    // tertiary but case must still be honoured, CASE_LEVEL has to be switched on.
    const bool needsCaseLevel = (strength < UCOL_TERTIARY) && !isIgnoreCase;
    if (needsCaseLevel)
    {
        ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr);
    }

    return pClonedCollator;
}
// Collator::getStrength — returns the strength of the collator held by this
// object, as reported by ucol_getStrength().
// NOTE(review): FETCH_COL presumably binds `data` and bails out with `false`
// when the object is unusable — confirm against the macro definition.
static int64_t HHVM_METHOD(Collator, getStrength) {
  FETCH_COL(data, this_, false);
  const int64_t strength = ucol_getStrength(data->collator());
  return strength;
}
/**
 * call-seq:
 *     collator.strength
 *
 * Returns the collation strength currently used by the UCollator, as an
 * integer. The strength influences how strings are compared.
 **/
VALUE icu4r_col_get_strength(VALUE self)
{
    const UCollationStrength strength = ucol_getStrength(UCOLLATOR(self));
    return INT2NUM(strength);
}