/*
 * Find the highest-scoring way of matching the needle of `m` inside its
 * haystack, exploring alternatives with an explicit stack instead of recursion.
 *
 * m               - match state: needle/haystack buffers and lengths, the memo
 *                   table and the scoring parameters read below.
 * stack           - work stack of pending (haystack index, needle index,
 *                   previous match index, score, positions) states.
 * final_positions - receives the needle_len match positions of the best
 *                   candidate. The needle_len entries that start at
 *                   final_positions + needle_len are used as scratch space.
 * searches        - one ICU searcher per needle index; a NULL entry makes the
 *                   candidate being evaluated score 0.
 *
 * Returns the best score seen, or 0.0 when no candidate scored above zero.
 */
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
    UChar32 cur_char, prev_char;
    double best_score = 0.0, running = 0.0, char_score = 0.0;
    int32_t hay_idx, needle_idx, prev_match, match_pos, next_needle, back, gap;
    int32_t *scratch = final_positions + m->needle_len;
    MemoryItem mem = {0};
    UStringSearch *search = NULL;
    UErrorCode status = U_ZERO_ERROR;

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hay_idx, &needle_idx, &prev_match, &running, scratch);

        /* NOTE(review): `mem` is a by-value copy of the memo entry, so the score
         * assigned to it further down is not stored back into m->memo by this
         * function. Whether the memcpy into mem.positions reaches shared storage
         * depends on MemoryItem's definition, which is not visible here. */
        mem = m->memo[hay_idx][needle_idx][prev_match];

        if (mem.score != DBL_MAX) {
            /* A memoized result exists: reuse its score and positions. */
            running = mem.score;
            memcpy(scratch, mem.positions, sizeof(*scratch) * m->needle_len);
        } else {
            /* Nothing memoized: walk the rest of the needle, one code point at a time. */
            next_needle = needle_idx;
            while (next_needle < m->needle_len) {
                needle_idx = next_needle;
                U16_FWD_1(m->needle, next_needle, m->needle_len); /* next_needle now indexes the following needle char */

                search = searches[needle_idx];
                if (search == NULL) { running = 0.0; break; }
                /* Less haystack left than needle left: this candidate cannot match. */
                if (m->haystack_len - hay_idx < m->needle_len - needle_idx) { running = 0.0; break; }

                /* Errors are deliberately ignored: hay_idx is already known to be valid. */
                status = U_ZERO_ERROR;
                usearch_setOffset(search, hay_idx, &status);
                status = U_ZERO_ERROR;
                match_pos = usearch_next(search, &status);
                if (match_pos == USEARCH_DONE) { running = 0.0; break; } /* no occurrence found */

                /* Code points between the previous match and this one. */
                gap = u_countChar32(m->haystack + prev_match, match_pos - prev_match);
                if (gap > 1) {
                    U16_GET(m->haystack, 0, match_pos, m->haystack_len, cur_char);
                    back = match_pos;
                    U16_PREV(m->haystack, 0, back, prev_char); /* prev_char precedes the match */
                    char_score = calc_score_for_char(m, prev_char, cur_char, gap);
                } else {
                    char_score = m->max_score_per_char;
                }

                /* Step the haystack index past the matched code point. */
                hay_idx = match_pos;
                U16_NEXT(m->haystack, hay_idx, m->haystack_len, cur_char);

                /* Queue the alternative: look for this same needle char again,
                 * starting after the occurrence just found, if enough haystack remains. */
                if (m->haystack_len - hay_idx >= m->needle_len - needle_idx)
                    stack_push(stack, hay_idx, needle_idx, prev_match, running, scratch);

                scratch[needle_idx] = match_pos;
                prev_match = match_pos;
                running += char_score;
            }

            mem.score = running;
            memcpy(mem.positions, scratch, sizeof(*scratch) * m->needle_len);
        }

        /* This candidate is fully evaluated; keep it if it beats the best so far. */
        if (running > best_score) {
            best_score = running;
            memcpy(final_positions, scratch, sizeof(*scratch) * m->needle_len);
        }
    }

    return best_score;
}
void StringSearch::setOffset(int32_t position, UErrorCode &status) { // status checked in usearch_setOffset usearch_setOffset(m_strsrch_, position, &status); }
inline size_t SearchBuffer::search(size_t& start) { size_t size = m_buffer.size(); if (m_atBreak) { if (!size) return 0; } else { if (size != m_buffer.capacity()) return 0; } UStringSearch* searcher = blink::searcher(); UErrorCode status = U_ZERO_ERROR; usearch_setText(searcher, m_buffer.data(), size, &status); ASSERT(status == U_ZERO_ERROR); usearch_setOffset(searcher, m_prefixLength, &status); ASSERT(status == U_ZERO_ERROR); int matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); nextMatch: if (!(matchStart >= 0 && static_cast<size_t>(matchStart) < size)) { ASSERT(matchStart == USEARCH_DONE); return 0; } // Matches that start in the overlap area are only tentative. // The same match may appear later, matching more characters, // possibly including a combining character that's not yet in the buffer. if (!m_atBreak && static_cast<size_t>(matchStart) >= size - m_overlap) { size_t overlap = m_overlap; if (m_options & AtWordStarts) { // Ensure that there is sufficient context before matchStart the next time around for // determining if it is at a word boundary. int wordBoundaryContextStart = matchStart; U16_BACK_1(m_buffer.data(), 0, wordBoundaryContextStart); wordBoundaryContextStart = startOfLastWordBoundaryContext(m_buffer.data(), wordBoundaryContextStart); overlap = std::min(size - 1, std::max(overlap, size - wordBoundaryContextStart)); } memcpy(m_buffer.data(), m_buffer.data() + size - overlap, overlap * sizeof(UChar)); m_prefixLength -= std::min(m_prefixLength, size - overlap); m_buffer.shrink(overlap); return 0; } size_t matchedLength = usearch_getMatchedLength(searcher); ASSERT_WITH_SECURITY_IMPLICATION(matchStart + matchedLength <= size); // If this match is "bad", move on to the next match. 
if (isBadMatch(m_buffer.data() + matchStart, matchedLength) || ((m_options & AtWordStarts) && !isWordStartMatch(matchStart, matchedLength))) { matchStart = usearch_next(searcher, &status); ASSERT(status == U_ZERO_ERROR); goto nextMatch; } size_t newSize = size - (matchStart + 1); memmove(m_buffer.data(), m_buffer.data() + matchStart + 1, newSize * sizeof(UChar)); m_prefixLength -= std::min<size_t>(m_prefixLength, matchStart + 1); m_buffer.shrink(newSize); start = size - matchStart; return matchedLength; }
/* {{{ grapheme_strpos_utf16 - strpos/strrpos using utf16
 *
 * Converts haystack and needle to UTF-16 and looks the needle up with an ICU
 * string search.
 *
 * haystack, haystack_len - input string and its length, as passed to
 *                          intl_convert_utf8_to_utf16.
 * needle, needle_len     - string to look for, converted the same way.
 * offset                 - start offset; when non-zero it is translated by
 *                          grapheme_get_haystack_offset() and a result of -1
 *                          is reported as "Invalid search offset".
 * puchar_pos             - optional out-parameter; initialised to -1 and set
 *                          to the position reported by the ICU search when a
 *                          match is accepted.
 * f_ignore_case          - non-zero lowers the collator strength to
 *                          UCOL_SECONDARY before searching.
 * last                   - non-zero searches with usearch_last(), otherwise
 *                          with usearch_next().
 *
 * Returns the value of grapheme_count_graphemes() for the match position
 * (presumably the grapheme index of the match - confirm against that helper),
 * or -1 when there is no match or the match does not start on a break-iterator
 * boundary.
 *
 * NOTE(review): STRPOS_CHECK_STATUS is defined outside this chunk. It
 * presumably records the error, releases uhaystack/uneedle/bi/src and returns
 * from the function on failure - confirm against the macro before renaming any
 * of these locals.
 */
int32_t grapheme_strpos_utf16(char *haystack, size_t haystack_len, char *needle, size_t needle_len, int32_t offset, int32_t *puchar_pos, int f_ignore_case, int last) {
    UChar *uhaystack = NULL, *uneedle = NULL;
    int32_t uhaystack_len = 0, uneedle_len = 0, char_pos, ret_pos, offset_pos = 0;
    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
    UBreakIterator* bi = NULL;
    UErrorCode status;
    UStringSearch* src = NULL;
    UCollator *coll;

    /* Default the optional out-parameter to "not found". */
    if(puchar_pos) {
        *puchar_pos = -1;
    }

    /* convert the strings to UTF-16. */
    status = U_ZERO_ERROR;
    intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, haystack, haystack_len, &status );
    STRPOS_CHECK_STATUS(status, "Error converting input string to UTF-16");

    status = U_ZERO_ERROR;
    intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, needle, needle_len, &status );
    STRPOS_CHECK_STATUS(status, "Error converting needle string to UTF-16");

    /* Set up a break iterator over the UTF-16 haystack; it is handed to the
     * search object and reused below for the offset and boundary checks. */
    status = U_ZERO_ERROR;
    bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status );
    STRPOS_CHECK_STATUS(status, "Failed to get iterator");
    status = U_ZERO_ERROR;
    ubrk_setText(bi, uhaystack, uhaystack_len, &status);
    STRPOS_CHECK_STATUS(status, "Failed to set up iterator");

    /* Open the search with an empty locale string and the break iterator above. */
    status = U_ZERO_ERROR;
    src = usearch_open(uneedle, uneedle_len, uhaystack, uhaystack_len, "", bi, &status);
    STRPOS_CHECK_STATUS(status, "Error creating search object");

    if(f_ignore_case) {
        /* Lower the strength on the search's own collator, then reset the
         * search after the attribute change. */
        coll = usearch_getCollator(src);
        status = U_ZERO_ERROR;
        ucol_setAttribute(coll, UCOL_STRENGTH, UCOL_SECONDARY, &status);
        STRPOS_CHECK_STATUS(status, "Error setting collation strength");
        usearch_reset(src);
    }

    if(offset != 0) {
        /* Translate the caller's offset into a haystack position; -1 means the
         * offset could not be mapped and is reported as an illegal argument. */
        offset_pos = grapheme_get_haystack_offset(bi, offset);
        if(offset_pos == -1) {
            status = U_ILLEGAL_ARGUMENT_ERROR;
            STRPOS_CHECK_STATUS(status, "Invalid search offset");
        }
        status = U_ZERO_ERROR;
        usearch_setOffset(src, offset_pos, &status);
        STRPOS_CHECK_STATUS(status, "Invalid search offset");
    }

    if(last) {
        char_pos = usearch_last(src, &status);
        if(char_pos < offset_pos) {
            /* the last match starts before our start offset, so treat it as not found */
            char_pos = USEARCH_DONE;
        }
    } else {
        char_pos = usearch_next(src, &status);
    }
    STRPOS_CHECK_STATUS(status, "Error looking up string");

    /* Only accept a match that begins on a boundary of the break iterator. */
    if(char_pos != USEARCH_DONE && ubrk_isBoundary(bi, char_pos)) {
        ret_pos = grapheme_count_graphemes(bi, uhaystack,char_pos);
        if(puchar_pos) {
            *puchar_pos = char_pos;
        }
    } else {
        ret_pos = -1;
    }

    /* Release the converted strings and the ICU objects. */
    if (uhaystack) {
        efree( uhaystack );
    }
    if (uneedle) {
        efree( uneedle );
    }
    ubrk_close (bi);
    usearch_close (src);

    return ret_pos;
}
// Positions the ICU searcher at |offset| in the current text.
//
// usearch_setOffset takes a 32-bit position, so |offset| is narrowed to
// int32_t; the cast only makes the existing conversion explicit. Any status
// other than U_ZERO_ERROR trips the DCHECK.
void TextSearcherICU::setOffset(size_t offset)
{
    UErrorCode status = U_ZERO_ERROR;
    usearch_setOffset(m_searcher, static_cast<int32_t>(offset), &status);
    DCHECK_EQ(status, U_ZERO_ERROR);
}