inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const { ASSERT(m_options & AtWordStarts); if (!start) return true; int size = m_buffer.size(); int offset = start; UChar32 firstCharacter; U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); if (m_options & TreatMedialCapitalAsWordStart) { UChar32 previousCharacter; U16_PREV(m_buffer.data(), 0, offset, previousCharacter); if (isSeparator(firstCharacter)) { // The start of a separator run is a word start (".org" in "webkit.org"). if (!isSeparator(previousCharacter)) return true; } else if (isASCIIUpper(firstCharacter)) { // The start of an uppercase run is a word start ("Kit" in "WebKit"). if (!isASCIIUpper(previousCharacter)) return true; // The last character of an uppercase run followed by a non-separator, non-digit // is a word start ("Request" in "XMLHTTPRequest"). offset = start; U16_FWD_1(m_buffer.data(), offset, size); UChar32 nextCharacter = 0; if (offset < size) U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) return true; } else if (isASCIIDigit(firstCharacter)) { // The start of a digit run is a word start ("2" in "WebKit2"). if (!isASCIIDigit(previousCharacter)) return true; } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { // The start of a non-separator, non-uppercase, non-digit run is a word start, // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). return true; } } // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes // a word, so treat the position before any CJK character as a word start. 
if (Character::isCJKIdeographOrSymbol(firstCharacter)) return true; size_t wordBreakSearchStart = start + length; while (wordBreakSearchStart > start) wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); if (wordBreakSearchStart != start) return false; if (m_options & WholeWord) return static_cast<int>(start + length) == findWordEndBoundary(m_buffer.data(), m_buffer.size(), wordBreakSearchStart); return true; }
// character_name {{{ static PyObject * icu_character_name(PyObject *self, PyObject *args) { char name[512] = {0}; int32_t sz = 0, alias = 0; UChar *buf; UErrorCode status = U_ZERO_ERROR; PyObject *palias = NULL, *result = NULL, *input = NULL; UChar32 code = 0; if (!PyArg_ParseTuple(args, "O|O", &input, &palias)) return NULL; if (palias != NULL && PyObject_IsTrue(palias)) alias = 1; buf = python_to_icu(input, &sz, 1); if (buf == NULL) goto end; U16_GET(buf, 0, 0, sz, code); if (alias) { sz = u_charName(code, U_CHAR_NAME_ALIAS, name, 511, &status); } else { sz = u_charName(code, U_UNICODE_CHAR_NAME, name, 511, &status); } if (U_FAILURE(status)) { PyErr_SetString(PyExc_ValueError, "Failed to get name for code"); goto end; } result = PyUnicode_DecodeUTF8(name, sz, "strict"); end: if (buf != NULL) free(buf); return result; } // }}}
// Returns the code point at the current index `pos` within `s`, or DONE when
// `pos` is at or beyond the end of `s`.
virtual UChar32 current32() const {
    if(pos>=LENGTHOF(s)) {
        return DONE;
    }
    UChar32 codePoint;
    U16_GET(s, 0, pos, LENGTHOF(s), codePoint);
    return codePoint;
}
// Moves the current index to `position` and returns the code point there.
// An out-of-range `position` (negative or greater than the length of `s`)
// leaves the index untouched and yields DONE; positioning exactly at the end
// is accepted but also yields DONE.
virtual UChar32 setIndex32(int32_t position) {
    if(position<0 || LENGTHOF(s)<position) {
        return DONE;
    }
    pos=position;
    if(pos>=LENGTHOF(s)) {
        return DONE;
    }
    UChar32 codePoint;
    U16_GET(s, 0, pos, LENGTHOF(s), codePoint);
    return codePoint;
}
/*
 * Score one haystack against the needle in `m`, using an explicit stack of
 * pending (hidx, nidx, last_idx, score, positions) states instead of recursion.
 *
 * searches[n] is the ICU string search used to locate the needle character
 * that starts at needle index n; a NULL entry ends the current attempt with a
 * score of 0.
 *
 * final_positions receives the match positions of the best attempt. The
 * region starting at final_positions + m->needle_len is used as scratch space,
 * so the buffer must hold at least 2 * m->needle_len entries.
 *
 * Returns the highest score seen (0.0 if nothing scored higher).
 */
static double
process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
    UChar32 cur_char, prev_char;
    double best_score = 0.0, score = 0.0, char_score = 0.0;
    int32_t found_at, next_nidx, cursor, hidx, nidx, last_idx, distance;
    int32_t *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};
    UStringSearch *search = NULL;
    UErrorCode status = U_ZERO_ERROR;

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];

        if (mem.score != DBL_MAX) {
            // A memoized result exists for this state, reuse it
            score = mem.score;
            memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        } else {
            // Nothing memoized, walk the rest of the needle
            next_nidx = nidx;
            while (next_nidx < m->needle_len) {
                nidx = next_nidx;
                U16_FWD_1(m->needle, next_nidx, m->needle_len);  // next_nidx is now the following needle char

                search = searches[nidx];
                if (search == NULL || m->haystack_len - hidx < m->needle_len - nidx) {
                    score = 0.0;
                    break;
                }

                // Errors from setOffset are ignored, hidx is already known to be valid
                status = U_ZERO_ERROR;
                usearch_setOffset(search, hidx, &status);
                status = U_ZERO_ERROR;
                found_at = usearch_next(search, &status);
                if (found_at == USEARCH_DONE) {
                    // No match in the remaining haystack
                    score = 0.0;
                    break;
                }

                distance = u_countChar32(m->haystack + last_idx, found_at - last_idx);
                if (distance > 1) {
                    U16_GET(m->haystack, 0, found_at, m->haystack_len, cur_char);
                    cursor = found_at;
                    U16_PREV(m->haystack, 0, cursor, prev_char);  // char just before the match
                    char_score = calc_score_for_char(m, prev_char, cur_char, distance);
                } else {
                    char_score = m->max_score_per_char;
                }

                // Step past the matched char; that is where the next search starts
                cursor = found_at;
                U16_NEXT(m->haystack, cursor, m->haystack_len, cur_char);
                hidx = cursor;

                // Queue the alternative of matching this needle char further along
                if (m->haystack_len - hidx >= m->needle_len - nidx) {
                    stack_push(stack, hidx, nidx, last_idx, score, positions);
                }

                last_idx = found_at;
                positions[nidx] = found_at;
                score += char_score;
            }
            // NOTE(review): mem is a by-value copy of the memo entry, so this
            // score is not written back to m->memo here - confirm intended.
            mem.score = score;
            memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
        }

        // The score for this state is known, keep it if it is the best so far
        if (score > best_score) {
            best_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return best_score;
}
// Builds a search buffer for `target`: stores a folded copy of the target,
// sizes the internal buffer, takes the global ICU searcher and configures it
// with the collation strength and pattern for this search.
inline SearchBuffer::SearchBuffer(const String& target, FindOptions options)
    : m_options(options)
    , m_prefixLength(0)
    , m_numberOfCharactersJustAppended(0)
    , m_atBreak(true)
    , m_needsMoreContext(options & AtWordStarts)
    , m_targetRequiresKanaWorkaround(containsKanaLetters(target))
{
    ASSERT(!target.isEmpty());
    target.appendTo(m_target);

    // FIXME: Ideally the searcher itself would be tailored to fold quote marks,
    // rather than running a separate replacement pass here, but as of this writing
    // ICU has no way to layer tailoring on top of the locale-specific tailoring.
    foldQuoteMarksAndSoftHyphens(m_target.data(), m_target.size());

    const size_t targetLength = m_target.size();
    m_buffer.reserveInitialCapacity(std::max(targetLength * 8, minimumSearchBufferSize));
    m_overlap = m_buffer.capacity() / 4;

    if ((m_options & AtWordStarts) && targetLength) {
        UChar32 leadingCharacter;
        U16_GET(m_target.data(), 0, 0, targetLength, leadingCharacter);
        // Separator-category characters never really begin a word, so a target
        // that starts with one simply has the AtWordStarts option dropped.
        if (isSeparator(leadingCharacter)) {
            m_options &= ~AtWordStarts;
            m_needsMoreContext = false;
        }
    }

    // Take the single global searcher.
    // Should more than one search buffer ever need to be active at a time,
    // this will have to move to multiple searchers.
    lockSearcher();

    UStringSearch* icuSearcher = blink::searcher();
    UCollator* icuCollator = usearch_getCollator(icuSearcher);

    UCollationStrength wantedStrength = UCOL_TERTIARY;
    if (m_options & CaseInsensitive)
        wantedStrength = UCOL_PRIMARY;
    if (ucol_getStrength(icuCollator) != wantedStrength) {
        ucol_setStrength(icuCollator, wantedStrength);
        usearch_reset(icuSearcher);
    }

    UErrorCode patternStatus = U_ZERO_ERROR;
    usearch_setPattern(icuSearcher, m_target.data(), targetLength, &patternStatus);
    ASSERT(patternStatus == U_ZERO_ERROR);

    // The kana workaround needs a normalized copy of the target string.
    if (m_targetRequiresKanaWorkaround) {
        normalizeCharactersIntoNFCForm(m_target.data(), m_target.size(), m_normalizedTarget);
    }
}
/*
 * Score one haystack against the needle in `m`, using an explicit stack of
 * pending (hidx, nidx, last_idx, score, positions) states instead of recursion.
 * Each needle character is located in the haystack with u_strchr32().
 *
 * final_positions receives the match positions of the best attempt. The
 * region starting at final_positions + m->needle_len is used as scratch space,
 * so the buffer must hold at least 2 * m->needle_len entries.
 *
 * Returns the highest score seen (0.0 if nothing scored higher).
 */
static double
process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 needle_char, cur_char, prev_char;
    UChar *hit;
    double best_score = 0.0, score = 0.0, char_score = 0.0;
    int32_t found_at, next_nidx, cursor, hidx, nidx, last_idx, distance;
    int32_t *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];

        if (mem.score != DBL_MAX) {
            // A memoized result exists for this state, reuse it
            score = mem.score;
            memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        } else {
            // Nothing memoized, walk the rest of the needle
            next_nidx = nidx;
            while (next_nidx < m->needle_len) {
                nidx = next_nidx;
                U16_NEXT(m->needle, next_nidx, m->needle_len, needle_char);  // next_nidx is now the following needle char

                if (m->haystack_len - hidx < m->needle_len - nidx) {
                    score = 0.0;
                    break;
                }

                hit = u_strchr32(m->haystack + hidx, needle_char);  // TODO: Use primary collation for the find
                if (hit == NULL) {
                    score = 0.0;
                    break;
                }
                found_at = (int32_t)(hit - m->haystack);

                distance = u_countChar32(m->haystack + last_idx, found_at - last_idx);
                if (distance > 1) {
                    U16_GET(m->haystack, 0, found_at, m->haystack_len, cur_char);
                    cursor = found_at;
                    U16_PREV(m->haystack, 0, cursor, prev_char);  // char just before the match
                    char_score = calc_score_for_char(m, prev_char, cur_char, distance);
                } else {
                    char_score = m->max_score_per_char;
                }

                // Step past the matched char; that is where the next search starts
                cursor = found_at;
                U16_NEXT(m->haystack, cursor, m->haystack_len, cur_char);
                hidx = cursor;

                // Queue the alternative of matching this needle char further along
                if (m->haystack_len - hidx >= m->needle_len - nidx) {
                    stack_push(stack, hidx, nidx, last_idx, score, positions);
                }

                last_idx = found_at;
                positions[nidx] = found_at;
                score += char_score;
            }
            // NOTE(review): mem is a by-value copy of the memo entry, so this
            // score is not written back to m->memo here - confirm intended.
            mem.score = score;
            memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
        }

        // The score for this state is known, keep it if it is the best so far
        if (score > best_score) {
            best_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return best_score;
}
/* Strip a trailing line terminator from a UTF-16 line, in place.
 *	buffer	line in ICU UChar format
 *	len	length of the line in UChar units, as given by u_strlen()
 * A trailing terminator listed in u32_line_term[] is overwritten with 0. When
 * the line ends in CR LF, the 0 is written over the CR so both are dropped.
 * Returns the number of UChar units remaining after the terminator is removed,
 * or len unchanged when the line does not end in a terminator.
 */
int trim_U16_line_term(UChar *buffer, int len)
{
	UChar32	last_cp, before_last_cp;
	int	term_index;

	if (0 == len)
		return 0;	/* nothing to trim in an empty string */
	U16_GET(buffer, 0, len - 1, len, last_cp);
	/* Find the table slot matching the last code point; stops on the 0 sentinel if none does */
	term_index = 0;
	while (u32_line_term[term_index] && (last_cp != u32_line_term[term_index]))
		term_index++;
	if ((U32_LT_LF == term_index) && (1 < len))
	{
		U16_GET(buffer, 0, len - 2, len, before_last_cp);
		if (u32_line_term[U32_LT_CR] == before_last_cp)
			len--;	/* CR LF pair: drop the CR along with the LF */
	}
	if (U32_LT_LAST < term_index)
		return len;	/* not a line terminator, keep everything */
	buffer[len - 1] = 0;
	return (len - 1);
}
// Fills fillinResult with strings derived from `segment` (segLen UChars): the
// segment itself, plus, for every code point in the segment that starts some
// canonical decomposition, each candidate start code point followed by every
// remainder string produced by extract().
// Returns fillinResult, or NULL if `status` is (or becomes) a failure or an
// allocation fails.
Hashtable *CanonicalIterator::getEquivalents2(Hashtable *fillinResult, const UChar *segment, int32_t segLen, UErrorCode &status) {

    if (U_FAILURE(status)) {
        return NULL;
    }

    //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(segment)));

    // The segment is always added as-is.
    UnicodeString whole(segment, segLen);
    fillinResult->put(whole, new UnicodeString(whole), status);

    UnicodeSet starts;

    // Visit each code point of the segment in turn.
    UChar32 cp;
    for (int32_t i = 0; i < segLen; i += U16_LENGTH(cp)) {
        U16_GET(segment, 0, i, segLen, cp);

        // Only code points that begin some decomposition are of interest.
        if (nfcImpl.getCanonStartSet(cp, starts)) {
            // Try every candidate and see which decompositions actually match.
            UnicodeSetIterator candidates(starts);
            while (candidates.next()) {
                UChar32 cp2 = candidates.getCodepoint();
                Hashtable remainder(status);
                remainder.setValueDeleter(uprv_deleteUObject);

                if (extract(&remainder, cp2, segment, segLen, i, status) != NULL) {
                    // Matches were found: record prefix + each remainder.
                    UnicodeString prefix(segment, i);
                    prefix += cp2;

                    int32_t el = UHASH_FIRST;
                    for (const UHashElement *entry = remainder.nextElement(el);
                         entry != NULL;
                         entry = remainder.nextElement(el)) {
                        const UnicodeString &tail = *static_cast<const UnicodeString *>(entry->value.pointer);
                        UnicodeString *combined = new UnicodeString(prefix);
                        /* test for NULL */
                        if (combined == 0) {
                            status = U_MEMORY_ALLOCATION_ERROR;
                            return NULL;
                        }
                        *combined += tail;
                        fillinResult->put(*combined, combined, status);

                        //if (PROGRESS) printf("Adding: %s\n", UToS(Tr(*combined)));
                    }
                }
            }
        }
    }

    /* Test for buffer overflows */
    if (U_FAILURE(status)) {
        return NULL;
    }
    return fillinResult;
}
static void TestGetChar() { static UChar input[]={ /* code unit,*/ 0xdc00, 0x20ac, 0xd841, 0x61, 0xd841, 0xdc02, 0xd842, 0xdc06, 0, 0xd842, 0xd7ff, 0xdc41, 0xe000, 0xd800 }; static UChar32 result[]={ /*codepoint-unsafe, codepoint-safe(not strict) codepoint-safe(strict)*/ (UChar32)0xfca10000, 0xdc00, UTF_ERROR_VALUE, 0x20ac, 0x20ac, 0x20ac, 0x12861, 0xd841, UTF_ERROR_VALUE, 0x61, 0x61, 0x61, 0x20402, 0x20402, 0x20402, 0x20402, 0x20402, 0x20402, 0x20806, 0x20806, 0x20806, 0x20806, 0x20806, 0x20806, 0x00, 0x00, 0x00, 0x203ff, 0xd842, UTF_ERROR_VALUE, 0xd7ff, 0xd7ff, 0xd7ff, 0xfc41, 0xdc41, UTF_ERROR_VALUE, 0xe000, 0xe000, 0xe000, 0x11734, 0xd800, UTF_ERROR_VALUE }; uint16_t i=0; UChar32 c; uint16_t offset=0; for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++) { if(0<offset && offset<sizeof(input)/U_SIZEOF_UCHAR-1){ UTF16_GET_CHAR_UNSAFE(input, offset, c); if(c != result[i]){ log_err("ERROR: UTF16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } U16_GET_UNSAFE(input, offset, c); if(c != result[i]){ log_err("ERROR: U16_GET_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } } UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE); if(c != result[i+1]){ log_err("ERROR: UTF16_GET_CHAR_SAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } U16_GET(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c); if(c != result[i+1]){ log_err("ERROR: U16_GET failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } UTF16_GET_CHAR_SAFE(input, 0, offset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE); if(c != result[i+2]){ log_err("ERROR: UTF16_GET_CHAR_SAFE(strict) failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+2], c); } i=(uint16_t)(i+3); } }
/* keep this in sync with utf8tst.c's TestNulTerminated() */
/*
 * Exercises the U16_* macros with length -1 (NUL-terminated input): forward
 * iteration must produce the code points in result[], and the random-access
 * macros must agree with forward iteration at every code unit index.
 */
static void TestNulTerminated() {
    static const UChar input[]={
        /*  0 */  0x61,
        /*  1 */  0xd801, 0xdc01,
        /*  3 */  0xdc01,
        /*  4 */  0x62,
        /*  5 */  0xd801,
        /*  6 */  0x00
        /*  7 */
    };
    static const UChar32 result[]={
        0x61,
        0x10401,
        0xdc01,
        0x62,
        0xd801,
        0
    };

    UChar32 cp, cpAt;
    int32_t cpStart, next=0, unit, limit, expectedIndex;
    int32_t cpCount=0;

    /* Pass 1: U16_NEXT, U16_FWD_1 and U16_FWD_N must all agree. */
    do {
        cpStart=next;
        U16_NEXT(input, next, -1, cp);
        if(cp!=result[cpCount]) {
            log_err("U16_NEXT(from %d)=U+%04x != U+%04x\n", cpStart, cp, result[cpCount]);
        }

        unit=cpStart;
        U16_FWD_1(input, unit, -1);
        if(unit!=next) {
            log_err("U16_FWD_1() moved to index %d but U16_NEXT() moved to %d\n", unit, next);
        }
        ++cpCount;

        /*
         * Advance this many code points from the start of the string.
         * U16_FWD_N() does not go past the end of the string, i.e. it stays on the NUL.
         */
        if(cp==0) {
            expectedIndex=next-1;
        } else {
            expectedIndex=next;
        }
        limit=0;
        U16_FWD_N(input, limit, -1, cpCount);
        if(limit!=expectedIndex) {
            log_err("U16_FWD_N(code points from 0) moved to index %d but expected %d\n", limit, expectedIndex);
        }
    } while(cp!=0);

    /* Pass 2: U16_GET and U16_SET_CP_LIMIT at every code unit of each code point. */
    next=0;
    do {
        cpStart=next;
        U16_NEXT(input, next, -1, cp);
        /* U16_NEXT always advances, so this visits at least the unit at cpStart. */
        for(unit=cpStart; unit<next; ++unit) {
            U16_GET(input, 0, unit, -1, cpAt);
            if(cpAt!=cp) {
                log_err("U16_NEXT(from %d)=U+%04x != U+%04x=U16_GET(at %d)\n", cpStart, cp, cpAt, unit);
            }
            /* U16_SET_CP_LIMIT moves from a non-lead unit to the limit of the code point */
            limit=unit+1;
            U16_SET_CP_LIMIT(input, 0, limit, -1);
            if(limit!=next) {
                log_err("U16_NEXT() moved to %d but U16_SET_CP_LIMIT(%d) moved to %d\n", next, unit+1, limit);
            }
        }
    } while(cp!=0);
}