Beispiel #1
0
/* this function makes a string with representation of a sortkey */
static char* U_EXPORT2 sortKeyToString(const UCollator *coll, const uint8_t *sortkey, char *buffer, uint32_t *len) {
    int32_t strength = UCOL_PRIMARY;
    uint32_t res_size = 0;
    UBool doneCase = FALSE;
    UErrorCode errorCode = U_ZERO_ERROR;

    char *current = buffer;
    const uint8_t *currentSk = sortkey;

    uprv_strcpy(current, "[");

    while(strength <= UCOL_QUATERNARY && strength <= ucol_getStrength(coll)) {
        if(strength > UCOL_PRIMARY) {
            uprv_strcat(current, " . ");
        }
        while(*currentSk != 0x01 && *currentSk != 0x00) { /* print a level */
            uprv_appendByteToHexString(current, *currentSk++);
            uprv_strcat(current, " ");
        }
        if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_ON && strength == UCOL_SECONDARY && doneCase == FALSE) {
            doneCase = TRUE;
        } else if(ucol_getAttribute(coll, UCOL_CASE_LEVEL, &errorCode) == UCOL_OFF || doneCase == TRUE || strength != UCOL_SECONDARY) {
            strength ++;
        }
        if (*currentSk) {
            uprv_appendByteToHexString(current, *currentSk++); /* This should print '01' */
        }
        if(strength == UCOL_QUATERNARY && ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &errorCode) == UCOL_NON_IGNORABLE) {
            break;
        }
    }

    if(ucol_getStrength(coll) == UCOL_IDENTICAL) {
        uprv_strcat(current, " . ");
        while(*currentSk != 0) {
            uprv_appendByteToHexString(current, *currentSk++);
            uprv_strcat(current, " ");
        }

        uprv_appendByteToHexString(current, *currentSk++);
    }
    uprv_strcat(current, "]");

    if(res_size > *len) {
        return NULL;
    }

    return buffer;
}
Beispiel #2
0
/*
 * The collator returned by this function is owned by the callee and must be
 * closed when this method returns with a U_SUCCESS UErrorCode.
 *
 * On error, the return value is undefined.
 */
UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr)
{
    UColAttributeValue strength = ucol_getStrength(pCollator);

    bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
    bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
    bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;

    if (isIgnoreCase)
    {
        strength = UCOL_SECONDARY;
    }

    if (isIgnoreNonSpace)
    {
        strength = UCOL_PRIMARY;
    }

    UCollator* pClonedCollator;
    std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);
    if (customRules.empty())
    {
        pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
    }
    else
    {
        int32_t customRuleLength = customRules.size();

        int32_t localeRulesLength;
        const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);

        std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0');
        for (int i = 0; i < localeRulesLength; i++)
        {
            completeRules[i] = localeRules[i];
        }
        for (int i = 0; i < customRuleLength; i++)
        {
            completeRules[localeRulesLength + i] = customRules[i];
        }

        pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr);
    }

    if (isIgnoreSymbols)
    {
        ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr);
    }

    ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr);

    // casing differs at the tertiary level.
    // if strength is less than tertiary, but we are not ignoring case, then we need to flip CASE_LEVEL On
    if (strength < UCOL_TERTIARY && !isIgnoreCase)
    {
        ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr);
    }

    return pClonedCollator;
}
Beispiel #3
0
int64_t c_Collator::t_getstrength() {
  if (!m_ucoll) {
    raise_warning("getstrength called on uninitialized Collator object");
    return 0;
  }
  return ucol_getStrength(m_ucoll);
}
Beispiel #4
0
int64 c_Collator::t_getstrength() {
  INSTANCE_METHOD_INJECTION_BUILTIN(Collator, Collator::getstrength);
  if (!m_ucoll) {
    raise_warning("getstrength called on uninitialized Collator object");
    return 0;
  }
  return ucol_getStrength(m_ucoll);
}
Beispiel #5
0
CEList::CEList(UCollator *coll, const UnicodeString &string, UErrorCode &status)
    : ces(NULL), listMax(CELIST_BUFFER_SIZE), listSize(0)
{
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    UCollationStrength strength = ucol_getStrength(coll);
    UBool toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
    uint32_t variableTop = ucol_getVariableTop(coll, &status);
    uint32_t strengthMask = 0;
    int32_t order;

    if (U_FAILURE(status)) {
        return;
    }

    // **** only set flag if string has Han(gul) ****
    // ucol_forceHanImplicit(elems, &status); -- removed for ticket #10476

    switch (strength)
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        U_FALLTHROUGH;
    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        U_FALLTHROUGH;
    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }

    ces = ceBuffer;

    while ((order = ucol_next(elems, &status)) != UCOL_NULLORDER) {
        UBool cont = isContinuation(order);

        order &= strengthMask;

        if (toShift && variableTop > (uint32_t)order && (order & UCOL_PRIMARYORDERMASK) != 0) {
            if (strength >= UCOL_QUATERNARY) {
                order &= UCOL_PRIMARYORDERMASK;
            } else {
                order = UCOL_IGNORABLE;
            }
        }

        if (order == UCOL_IGNORABLE) {
            continue;
        }

        if (cont) {
            order |= UCOL_CONTINUATION_MARKER;
        }

        add(order, status);
    }

    ucol_closeElements(elems);
}
Beispiel #6
0
void TextSearcherICU::setCaseSensitivity(bool caseSensitive) {
  const UCollationStrength strength =
      caseSensitive ? UCOL_TERTIARY : UCOL_PRIMARY;

  UCollator* const collator = usearch_getCollator(m_searcher);
  if (ucol_getStrength(collator) == strength)
    return;

  ucol_setStrength(collator, strength);
  usearch_reset(m_searcher);
}
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options)
    , m_prefixLength(0)
    , m_numberOfCharactersJustAppended(0)
    , m_atBreak(true)
    , m_needsMoreContext(options & AtWordStarts)
    , m_targetRequiresKanaWorkaround(containsKanaLetters(target))
{
    ASSERT(!target.isEmpty());
    target.appendTo(m_target);

    // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
    // of doing it in a separate replacement pass here, but ICU doesn't offer a way
    // to add tailoring on top of the locale-specific tailoring as of this writing.
    foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

    size_t targetLength = m_target.size();
    m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize));
    m_overlap = m_buffer.capacity() / 4;

    if ((m_options & AtWordStarts) && targetLength) {
        UChar32 targetFirstCharacter;
        U16_GET(m_target.data(), 0, 0, targetLength, targetFirstCharacter);
        // Characters in the separator category never really occur at the beginning of a word,
        // so if the target begins with such a character, we just ignore the AtWordStart option.
        if (isSeparator(targetFirstCharacter)) {
            m_options &= ~AtWordStarts;
            m_needsMoreContext = false;
        }
    }

    // Grab the single global searcher.
    // If we ever have a reason to do more than once search buffer at once, we'll have
    // to move to multiple searchers.
    lockSearcher();

    UStringSearch* searcher = blink::searcher();
    UCollator* collator = usearch_getCollator(searcher);

    UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
    if (ucol_getStrength(collator) != strength) {
        ucol_setStrength(collator, strength);
        usearch_reset(searcher);
    }

    UErrorCode status = U_ZERO_ERROR;
    usearch_setPattern(searcher, m_target.data(), targetLength, &status);
    ASSERT(status == U_ZERO_ERROR);

    // The kana workaround requires a normalized copy of the target string.
    if (m_targetRequiresKanaWorkaround)
        normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget);
}
Beispiel #8
0
int32_t CollationElementIterator::strengthOrder(int32_t order) const
{
    UCollationStrength s = ucol_getStrength(m_data_->iteratordata_.coll);
    // Mask off the unwanted differences.
    if (s == UCOL_PRIMARY) {
        order &= RuleBasedCollator::PRIMARYDIFFERENCEONLY;
    }
    else if (s == UCOL_SECONDARY) {
        order &= RuleBasedCollator::SECONDARYDIFFERENCEONLY;
    }

    return order;
}
Beispiel #9
0
static int icu_collator_strength(lua_State *L) {
	luaL_argcheck(L, lua_getmetatable(L,1) && lua_rawequal(L,-1,COLLATOR_UV_META), 1, "expecting collator");
	lua_pop(L,1);
	if (lua_gettop(L) == 1) {
		lua_pushnumber(L, ucol_getStrength(*(UCollator**)lua_touserdata(L,1)));
		return 1;
	}
	else {
		ucol_setStrength(*(UCollator**)lua_touserdata(L,1), (UCollationStrength)luaL_checknumber(L,2));
		lua_settop(L,1);
		return 1;
	}
}
Beispiel #10
0
Target::Target(UCollator *theCollator, const UnicodeString *target, int32_t patternLength, UErrorCode &status)
    : bufferSize(0), bufferMin(0), bufferMax(0),
      strengthMask(0), strength(UCOL_PRIMARY), variableTop(0), toShift(FALSE), coll(theCollator),
      nfd(*Normalizer2Factory::getNFDInstance(status)),
      targetString(NULL), targetBuffer(NULL), targetLength(0), elements(NULL), charBreakIterator(NULL)
{
    strength = ucol_getStrength(coll);
    toShift = ucol_getAttribute(coll, UCOL_ALTERNATE_HANDLING, &status) ==  UCOL_SHIFTED;
    variableTop = ucol_getVariableTop(coll, &status);

    // find the largest expansion
    uint8_t maxExpansion = 0;
    for (const uint8_t *expansion = coll->expansionCESize; *expansion != 0; expansion += 1) {
        if (*expansion > maxExpansion) {
            maxExpansion = *expansion;
        }
    }

    // room for an extra character on each end, plus 4 for safety
    bufferSize = patternLength + (2 * maxExpansion) + 4;

    ceb = NEW_ARRAY(CEI, bufferSize);

    if (ceb == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    if (target != NULL) {
        setTargetString(target);
    }

    switch (strength)
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }
}
Beispiel #11
0
OrderList::OrderList(UCollator *coll, const UnicodeString &string, int32_t stringOffset)
    : list(NULL), listMax(16), listSize(0)
{
    UErrorCode status = U_ZERO_ERROR;
    UCollationElements *elems = ucol_openElements(coll, string.getBuffer(), string.length(), &status);
    uint32_t strengthMask = 0;
    int32_t order, low, high;

    switch (ucol_getStrength(coll))
    {
    default:
        strengthMask |= UCOL_TERTIARYORDERMASK;
        /* fall through */

    case UCOL_SECONDARY:
        strengthMask |= UCOL_SECONDARYORDERMASK;
        /* fall through */

    case UCOL_PRIMARY:
        strengthMask |= UCOL_PRIMARYORDERMASK;
    }

    list = new Order[listMax];

    ucol_setOffset(elems, stringOffset, &status);

    do {
        low   = ucol_getOffset(elems);
        order = ucol_next(elems, &status);
        high  = ucol_getOffset(elems);

        if (order != UCOL_NULLORDER) {
            order &= strengthMask;
        }

        if (order != UCOL_IGNORABLE) {
            add(order, low, high);
        }
    } while (order != UCOL_NULLORDER);

    ucol_closeElements(elems);
}
Beispiel #12
0
// Collator.strength {{{
static PyObject *
icu_Collator_get_strength(icu_Collator *self, void *closure) {
    return Py_BuildValue("i", ucol_getStrength(self->collator));
}
Beispiel #13
0
/*
 * The collator returned by this function is owned by the callee and must be
 * closed when this method returns with a U_SUCCESS UErrorCode.
 *
 * On error, the return value is undefined.
 */
UCollator* CloneCollatorWithOptions(const UCollator* pCollator, int32_t options, UErrorCode* pErr)
{
    UColAttributeValue strength = ucol_getStrength(pCollator);

    bool isIgnoreCase = (options & CompareOptionsIgnoreCase) == CompareOptionsIgnoreCase;
    bool isIgnoreNonSpace = (options & CompareOptionsIgnoreNonSpace) == CompareOptionsIgnoreNonSpace;
    bool isIgnoreSymbols = (options & CompareOptionsIgnoreSymbols) == CompareOptionsIgnoreSymbols;

    if (isIgnoreCase)
    {
        strength = UCOL_SECONDARY;
    }

    if (isIgnoreNonSpace)
    {
        strength = UCOL_PRIMARY;
    }

    UCollator* pClonedCollator;
    std::vector<UChar> customRules = GetCustomRules(options, strength, isIgnoreSymbols);
    if (customRules.empty())
    {
        pClonedCollator = ucol_safeClone(pCollator, nullptr, nullptr, pErr);
    }
    else
    {
        int32_t customRuleLength = customRules.size();

        int32_t localeRulesLength;
        const UChar* localeRules = ucol_getRules(pCollator, &localeRulesLength);

        std::vector<UChar> completeRules(localeRulesLength + customRuleLength + 1, '\0');
        for (int i = 0; i < localeRulesLength; i++)
        {
            completeRules[i] = localeRules[i];
        }
        for (int i = 0; i < customRuleLength; i++)
        {
            completeRules[localeRulesLength + i] = customRules[i];
        }

        pClonedCollator = ucol_openRules(completeRules.data(), completeRules.size(), UCOL_DEFAULT, strength, NULL, pErr);
    }

    if (isIgnoreSymbols)
    {
        ucol_setAttribute(pClonedCollator, UCOL_ALTERNATE_HANDLING, UCOL_SHIFTED, pErr);

        // by default, ICU alternate shifted handling only ignores punctuation, but
        // IgnoreSymbols needs symbols and currency as well, so change the "variable top"
        // to include all symbols and currency
#if HAVE_SET_MAX_VARIABLE
        ucol_setMaxVariable(pClonedCollator, UCOL_REORDER_CODE_CURRENCY, pErr);
#else
        // 0xfdfc is the last currency character before the first digit character
        // in http://source.icu-project.org/repos/icu/icu/tags/release-52-1/source/data/unidata/FractionalUCA.txt
        const UChar ignoreSymbolsVariableTop[] = { 0xfdfc };
        ucol_setVariableTop(pClonedCollator, ignoreSymbolsVariableTop, 1, pErr);
#endif
    }

    ucol_setAttribute(pClonedCollator, UCOL_STRENGTH, strength, pErr);

    // casing differs at the tertiary level.
    // if strength is less than tertiary, but we are not ignoring case, then we need to flip CASE_LEVEL On
    if (strength < UCOL_TERTIARY && !isIgnoreCase)
    {
        ucol_setAttribute(pClonedCollator, UCOL_CASE_LEVEL, UCOL_ON, pErr);
    }

    return pClonedCollator;
}
Beispiel #14
0
static int64_t HHVM_METHOD(Collator, getStrength) {
  FETCH_COL(data, this_, false);
  return ucol_getStrength(data->collator());
}
Beispiel #15
0
/**
 * call-seq:
 *     collator.strength 
 *
 * Get the collation strength used in a UCollator. The strength influences how strings are compared.
 **/
VALUE icu4r_col_get_strength(VALUE self)
{
    return INT2NUM(ucol_getStrength(UCOLLATOR(self)));
}