Пример #1
0
inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
{
    ASSERT(m_options & AtWordStarts);

    if (!start)
        return true;

    int size = m_buffer.size();
    int offset = start;
    UChar32 firstCharacter;
    U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);

    if (m_options & TreatMedialCapitalAsWordStart) {
        UChar32 previousCharacter;
        U16_PREV(m_buffer.data(), 0, offset, previousCharacter);

        if (isSeparator(firstCharacter)) {
            // The start of a separator run is a word start (".org" in "webkit.org").
            if (!isSeparator(previousCharacter))
                return true;
        } else if (isASCIIUpper(firstCharacter)) {
            // The start of an uppercase run is a word start ("Kit" in "WebKit").
            if (!isASCIIUpper(previousCharacter))
                return true;
            // The last character of an uppercase run followed by a non-separator, non-digit
            // is a word start ("Request" in "XMLHTTPRequest").
            offset = start;
            U16_FWD_1(m_buffer.data(), offset, size);
            UChar32 nextCharacter = 0;
            if (offset < size)
                U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
            if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
                return true;
        } else if (isASCIIDigit(firstCharacter)) {
            // The start of a digit run is a word start ("2" in "WebKit2").
            if (!isASCIIDigit(previousCharacter))
                return true;
        } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
            // The start of a non-separator, non-uppercase, non-digit run is a word start,
            // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
            return true;
        }
    }

    // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
    // a word, so treat the position before any CJK character as a word start.
    if (Character::isCJKIdeographOrSymbol(firstCharacter))
        return true;

    size_t wordBreakSearchStart = start + length;
    while (wordBreakSearchStart > start)
        wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
    if (wordBreakSearchStart != start)
        return false;
    if (m_options & WholeWord)
        return static_cast<int>(start + length) == findWordEndBoundary(m_buffer.data(), m_buffer.size(), wordBreakSearchStart);
    return true;
}
Пример #2
0
// character_name {{{
static PyObject *
icu_character_name(PyObject *self, PyObject *args) {
    char name[512] = {0}; 
    int32_t sz = 0, alias = 0;
    UChar *buf;
    UErrorCode status = U_ZERO_ERROR;
    PyObject *palias = NULL, *result = NULL, *input = NULL;
    UChar32 code = 0;
  
    if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL;

    if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; 
    buf = python_to_icu(input, &sz, 1);
    if (buf == NULL) goto end; 
    U16_GET(buf, 0, 0, sz, code);
    if (alias) {
        sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status);
    } else {
        sz = u_charName(code, U_UNICODE_CHAR_NAME, name, 511, &status);
    }
    if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); goto end; }
    result = PyUnicode_DecodeUTF8(name, sz, "strict");
end:
    if (buf != NULL) free(buf);

    return result;
} // }}}
Пример #3
0
 virtual UChar32 current32() const {
     if(pos<LENGTHOF(s)) {
         UChar32 c;
         U16_GET(s, 0, pos, LENGTHOF(s), c);
         return c;
     } else {
         return DONE;
     }
 }
Пример #4
0
 virtual UChar32 setIndex32(int32_t position) {
     if(0<=position && position<=LENGTHOF(s)) {
         pos=position;
         if(pos<LENGTHOF(s)) {
             UChar32 c;
             U16_GET(s, 0, pos, LENGTHOF(s), c);
             return c;
         }
     }
     return DONE;
 }
Пример #5
0
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
    UChar32 hc, lc;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};
    UStringSearch *search = NULL;
    UErrorCode status = U_ZERO_ERROR;

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_FWD_1(m->needle, i, m->needle_len);// i now points to next char in needle 
                search = searches[nidx];
                if (search == NULL || m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                status = U_ZERO_ERROR; // We ignore any errors as we already know that hidx is correct
                usearch_setOffset(search, hidx, &status);
                status = U_ZERO_ERROR;
                pos = usearch_next(search, &status);
                if (pos == USEARCH_DONE) { score = 0.0; break; } // No matches found
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);  
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc); 
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc); 
                hidx = j;
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos; 
                positions[nidx] = pos; 
                score += score_for_char;
            } // for(i) iterate over needle
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);

        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
Пример #6
0
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options)
    , m_prefixLength(0)
    , m_numberOfCharactersJustAppended(0)
    , m_atBreak(true)
    , m_needsMoreContext(options & AtWordStarts)
    , m_targetRequiresKanaWorkaround(containsKanaLetters(target))
{
    ASSERT(!target.isEmpty());
    target.appendTo(m_target);

    // FIXME: We'd like to tailor the searcher to fold quote marks for us instead
    // of doing it in a separate replacement pass here, but ICU doesn't offer a way
    // to add tailoring on top of the locale-specific tailoring as of this writing.
    foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

    size_t targetLength = m_target.size();
    m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize));
    m_overlap = m_buffer.capacity() / 4;

    if ((m_options & AtWordStarts) && targetLength) {
        UChar32 targetFirstCharacter;
        U16_GET(m_target.data(), 0, 0, targetLength, targetFirstCharacter);
        // Characters in the separator category never really occur at the beginning of a word,
        // so if the target begins with such a character, we just ignore the AtWordStart option.
        if (isSeparator(targetFirstCharacter)) {
            m_options &= ~AtWordStarts;
            m_needsMoreContext = false;
        }
    }

    // Grab the single global searcher.
    // If we ever have a reason to do more than once search buffer at once, we'll have
    // to move to multiple searchers.
    lockSearcher();

    UStringSearch* searcher = blink::searcher();
    UCollator* collator = usearch_getCollator(searcher);

    UCollationStrength strength = m_options & CaseInsensitive ? UCOL_PRIMARY : UCOL_TERTIARY;
    if (ucol_getStrength(collator) != strength) {
        ucol_setStrength(collator, strength);
        usearch_reset(searcher);
    }

    UErrorCode status = U_ZERO_ERROR;
    usearch_setPattern(searcher, m_target.data(), targetLength, &status);
    ASSERT(status == U_ZERO_ERROR);

    // The kana workaround requires a normalized copy of the target string.
    if (m_targetRequiresKanaWorkaround)
        normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget);
}
Пример #7
0
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 nc, hc, lc;
    UChar *p;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle 
                if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                p = u_strchr32(m->haystack + hidx, nc);  // TODO: Use primary collation for the find
                if (p == NULL) { score = 0.0; break; }
                pos = (int32_t)(p - m->haystack);
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);  
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc); 
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc); 
                hidx = j;
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos; 
                positions[nidx] = pos; 
                score += score_for_char;
            } // for(i) iterate over needle
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);

        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
Пример #8
0
/* Remove the trailing line terminator from buffer.
 * buffer	line in ICU UChar format
 * len		length of line as number of UChar characters
 * 			as given by u_strlen()
 * Returns	number of characters after removing line terminator
 */
int trim_U16_line_term(UChar *buffer, int len)
{
	int	lt_index;
	UChar32	uc32_cp;

	if (0 == len)
		return 0;		/* zero length string */
	U16_GET(buffer, 0, len - 1, len, uc32_cp);
	for (lt_index = 0; u32_line_term[lt_index]; lt_index++)
		if (uc32_cp == u32_line_term[lt_index])
			break;
	if ((U32_LT_LF == lt_index) && (1 < len))
	{
		U16_GET(buffer, 0, len - 2, len, uc32_cp);
		if (u32_line_term[U32_LT_CR] == uc32_cp)
			len--;		/* trim both CR and LF */
	}
	if (U32_LT_LAST >= lt_index)
	{
		buffer[len - 1] = 0;
		return (len - 1);
	}
	return len;		/* no line terminator so return it all */
}
Пример #9
0
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

    UnicodeString toPut(segment, segLen);

    fillinResult->put(toPut, new UnicodeString(toPut), status);

    UnicodeSet starts;

    // cycle through all the characters
    UChar32 cp;
    for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
        // see if any character is at the start of some decomposition
        U16_GET(segment, 0, i, segLen, cp);
        if (!nfcImpl.getCanonStartSet(cp, starts)) {
            continue;
        }
        // if so, see which decompositions match
        UnicodeSetIterator iter(starts);
        while (iter.next()) {
            UChar32 cp2 = iter.getCodepoint();
            Hashtable remainder(status);
            remainder.setValueDeleter(uprv_deleteUObject);
            if (extract(&remainder, cp2, segment, segLen, i, status) == NULL) {
                continue;
            }

            // there were some matches, so add all the possibilities to the set.
            UnicodeString prefix(segment, i);
            prefix += cp2;

            int32_t el = UHASH_FIRST;
            const UHashElement *ne = remainder.nextElement(el);
            while (ne != NULL) {
                UnicodeString item = *((UnicodeString *)(ne->value.pointer));
                UnicodeString *toAdd = new UnicodeString(prefix);
                /* test for NULL */
                if (toAdd == 0) {
                    status = U_MEMORY_ALLOCATION_ERROR;
                    return NULL;
                }
                *toAdd += item;
                fillinResult->put(*toAdd, toAdd, status);

                //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*toAdd)));

                ne = remainder.nextElement(el);
            }
        }
    }

    /* Test for buffer overflows */
    if(U_FAILURE(status)) {
        return NULL;
    }
    return fillinResult;
}
Пример #10
0
static void TestGetChar()
{
    static UChar input[]={
    /*  code unit,*/
        0xdc00,
        0x20ac,    
        0xd841,      
        0x61,      
        0xd841,      
        0xdc02,     
        0xd842,    
        0xdc06,     
        0,
        0xd842,
        0xd7ff,
        0xdc41,
        0xe000,
        0xd800
    };
    static UChar32 result[]={
     /*codepoint-unsafe,  codepoint-safe(not strict)  codepoint-safe(strict)*/
        (UChar32)0xfca10000, 0xdc00,                  UTF_ERROR_VALUE,   
        0x20ac,           0x20ac,                     0x20ac,
        0x12861,          0xd841,                     UTF_ERROR_VALUE,
        0x61,             0x61,                       0x61, 
        0x20402,          0x20402,                    0x20402,  
        0x20402,          0x20402,                    0x20402,
        0x20806,          0x20806,                    0x20806,
        0x20806,          0x20806,                    0x20806,
        0x00,             0x00,                       0x00,
        0x203ff,          0xd842,                     UTF_ERROR_VALUE,
        0xd7ff,           0xd7ff,                     0xd7ff,  
        0xfc41,           0xdc41,                     UTF_ERROR_VALUE,
        0xe000,           0xe000,                     0xe000, 
        0x11734,          0xd800,                     UTF_ERROR_VALUE  
    };
    uint16_t i=0;
    UChar32 c;
    uint16_t offset=0;
    for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++) {
        if(0<offset && offset<sizeof(input)/U_SIZEOF_UCHAR-1){
            UTF16_GET_CHAR_UNSAFE(input, offset, c);
            if(c != result[i]){
                log_err("ERROR: UTF16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
            }

            U16_GET_UNSAFE(input, offset, c);
            if(c != result[i]){
                log_err("ERROR: U16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c);
            }
        }

        UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
        if(c != result[i+1]){
            log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
        }

        U16_GET(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c);
        if(c != result[i+1]){
            log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c);
        }

        UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
        if(c != result[i+2]){
            log_err("ERROR: UTF16_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c);
        }
        i=(uint16_t)(i+3);
    }

}
Пример #11
0
/* keep this in sync with utf8tst.c's TestNulTerminated() */
static void TestNulTerminated() {
    static const UChar input[]={
        /*  0 */  0x61,
        /*  1 */  0xd801, 0xdc01,
        /*  3 */  0xdc01,
        /*  4 */  0x62,
        /*  5 */  0xd801,
        /*  6 */  0x00
        /*  7 */
    };
    static const UChar32 result[]={
        0x61,
        0x10401,
        0xdc01,
        0x62,
        0xd801,
        0
    };

    UChar32 c, c2;
    int32_t i0, i=0, j, k, expectedIndex;
    int32_t cpIndex=0;
    do {
        i0=i;
        U16_NEXT(input, i, -1, c);
        if(c!=result[cpIndex]) {
            log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", i0, c, result[cpIndex]);
        }
        j=i0;
        U16_FWD_1(input, j, -1);
        if(j!=i) {
            log_err("U16_FWD_1() moved to index %d but U16_NEXT() moved to %d\n", j, i);
        }
        ++cpIndex;
        /*
         * Move by this many code points from the start.
         * U16_FWD_N() stops at the end of the string, that is, at the NUL if necessary.
         */
        expectedIndex= (c==0) ? i-1 : i;
        k=0;
        U16_FWD_N(input, k, -1, cpIndex);
        if(k!=expectedIndex) {
            log_err("U16_FWD_N(code points from 0) moved to index %d but expected %d\n", k, expectedIndex);
        }
    } while(c!=0);

    i=0;
    do {
        j=i0=i;
        U16_NEXT(input, i, -1, c);
        do {
            U16_GET(input, 0, j, -1, c2);
            if(c2!=c) {
                log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", i0, c, c2, j);
            }
            /* U16_SET_CP_LIMIT moves from a non-lead byte to the limit of the code point */
            k=j+1;
            U16_SET_CP_LIMIT(input, 0, k, -1);
            if(k!=i) {
                log_err("U16_NEXT() moved to %d but U16_SET_CP_LIMIT(%d) moved to %d\n", i, j+1, k);
            }
        } while(++j<i);
    } while(c!=0);
}