/*
 * Context-iterator callback over the UTF-16 text described by a UCaseContext.
 *
 * context  pointer to the UCaseContext holding the text pointer (p), the
 *          overall bounds (start, limit), the current code point bounds
 *          (cpStart, cpLimit), the running index and the stored direction.
 * dir      < 0: restart at cpStart and iterate backward;
 *          > 0: restart at cpLimit and iterate forward;
 *          = 0: keep going in the direction stored in the context.
 *
 * Returns the next code point in the selected direction, or U_SENTINEL when
 * the index has reached start (backward) or limit (forward).
 */
static UChar32 U_CALLCONV
utf16_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *ctx = (UCaseContext *)context;
    const UChar *text;
    UChar32 cp;

    if (dir == 0) {
        /* no reset requested: reuse the direction of the last reset */
        dir = ctx->dir;
    } else {
        /* reset: backward iteration starts at cpStart, forward at cpLimit */
        ctx->index = (dir < 0) ? ctx->cpStart : ctx->cpLimit;
        ctx->dir = dir;
    }

    text = (const UChar *)ctx->p;
    if (dir < 0) {
        if (ctx->start < ctx->index) {
            U16_PREV(text, ctx->start, ctx->index, cp);
            return cp;
        }
        return U_SENTINEL;
    }

    if (ctx->index < ctx->limit) {
        U16_NEXT(text, ctx->index, ctx->limit, cp);
        return cp;
    }
    return U_SENTINEL;
}
/*
 * Locate the start of the trailing whitespace run of a UTF-16 string.
 *
 * s       the string; NULL yields NULL.
 * length  number of code units in s; 0 returns s unchanged, a negative
 *         value means the length is obtained with u_strlen(s).
 *
 * Walks backward one code point at a time while the code point is U+0020 or
 * satisfies u_isUWhiteSpace(). Returns a pointer to the first code unit of
 * that trailing run, which is s + length when the string does not end in
 * whitespace.
 */
U_CAPI const UChar * U_EXPORT2
u_strTrailingWhiteSpaceStart(const UChar *s, int32_t length) {
    int32_t cursor;
    int32_t runStart;
    UChar32 cp = 0;

    if (s == NULL) {
        return NULL;
    }
    if (length == 0) {
        return s;
    }
    if (length < 0) {
        length = u_strlen(s);
    }

    /* runStart trails the cursor: it only advances past confirmed whitespace */
    runStart = length;
    cursor = length;
    while (cursor > 0) {
        U16_PREV(s, 0, cursor, cp);
        if (cp != 0x20 && !u_isUWhiteSpace(cp)) {
            break;
        }
        runStart = cursor;
    }
    return s + runStart;
}
inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const { ASSERT(m_options & AtWordStarts); if (!start) return true; int size = m_buffer.size(); int offset = start; UChar32 firstCharacter; U16_GET(m_buffer.data(), 0, offset, size, firstCharacter); if (m_options & TreatMedialCapitalAsWordStart) { UChar32 previousCharacter; U16_PREV(m_buffer.data(), 0, offset, previousCharacter); if (isSeparator(firstCharacter)) { // The start of a separator run is a word start (".org" in "webkit.org"). if (!isSeparator(previousCharacter)) return true; } else if (isASCIIUpper(firstCharacter)) { // The start of an uppercase run is a word start ("Kit" in "WebKit"). if (!isASCIIUpper(previousCharacter)) return true; // The last character of an uppercase run followed by a non-separator, non-digit // is a word start ("Request" in "XMLHTTPRequest"). offset = start; U16_FWD_1(m_buffer.data(), offset, size); UChar32 nextCharacter = 0; if (offset < size) U16_GET(m_buffer.data(), 0, offset, size, nextCharacter); if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter)) return true; } else if (isASCIIDigit(firstCharacter)) { // The start of a digit run is a word start ("2" in "WebKit2"). if (!isASCIIDigit(previousCharacter)) return true; } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) { // The start of a non-separator, non-uppercase, non-digit run is a word start, // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore"). return true; } } // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes // a word, so treat the position before any CJK character as a word start. 
if (Character::isCJKIdeographOrSymbol(firstCharacter)) return true; size_t wordBreakSearchStart = start + length; while (wordBreakSearchStart > start) wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */); if (wordBreakSearchStart != start) return false; if (m_options & WholeWord) return static_cast<int>(start + length) == findWordEndBoundary(m_buffer.data(), m_buffer.size(), wordBreakSearchStart); return true; }
int startOfLastWordBoundaryContext(const UChar* characters, int length) { for (int i = length; i > 0;) { int last = i; UChar32 ch; U16_PREV(characters, 0, i, ch); if (!requiresContextForWordBoundary(ch)) return last; } return 0; }
/*
 * Score one haystack against the needle and return the best score found.
 *
 * m                match state: needle/haystack buffers and lengths, the memo
 *                  table and max_score_per_char.
 * stack            explicit work stack of (hidx, nidx, last_idx, score,
 *                  positions) states; drained until empty.
 * final_positions  receives the needle_len match positions of the best-scoring
 *                  alignment. The scratch `positions` array lives at
 *                  final_positions + needle_len, so the buffer must hold at
 *                  least 2 * needle_len entries.
 * searches         per-needle-offset string searchers, indexed by the code
 *                  unit offset of each needle character; a NULL entry ends the
 *                  current attempt with score 0.
 *
 * Returns 0.0 when no alignment scores above zero.
 */
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
    UChar32 cur_char, prev_char;
    double best_score = 0.0, score = 0.0, char_score = 0.0;
    int32_t match_pos, next_nidx, cursor, hidx, nidx, last_idx, gap;
    int32_t *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};
    UStringSearch *searcher = NULL;
    UErrorCode status = U_ZERO_ERROR;

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];

        if (mem.score != DBL_MAX) {
            // Memoized result available: reuse it
            score = mem.score;
            memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        } else {
            // No memoized result: walk the remaining needle characters
            next_nidx = nidx;
            while (next_nidx < m->needle_len) {
                nidx = next_nidx;
                U16_FWD_1(m->needle, next_nidx, m->needle_len); // next_nidx now points to the next char in needle

                searcher = searches[nidx];
                if (searcher == NULL || m->haystack_len - hidx < m->needle_len - nidx) {
                    score = 0.0;
                    break;
                }

                // Errors are deliberately ignored: hidx is already known to be a valid offset
                status = U_ZERO_ERROR;
                usearch_setOffset(searcher, hidx, &status);
                status = U_ZERO_ERROR;
                match_pos = usearch_next(searcher, &status);
                if (match_pos == USEARCH_DONE) {
                    // No match for this needle character in the rest of the haystack
                    score = 0.0;
                    break;
                }

                gap = u_countChar32(m->haystack + last_idx, match_pos - last_idx);
                if (gap <= 1) {
                    char_score = m->max_score_per_char;
                } else {
                    U16_GET(m->haystack, 0, match_pos, m->haystack_len, cur_char);
                    cursor = match_pos;
                    U16_PREV(m->haystack, 0, cursor, prev_char); // prev_char is the character before the match
                    char_score = calc_score_for_char(m, prev_char, cur_char, gap);
                }

                // Step over the matched character to get the next haystack start offset
                cursor = match_pos;
                U16_NEXT(m->haystack, cursor, m->haystack_len, cur_char);
                hidx = cursor;

                // Queue the alternative of matching this same needle char further along
                if (m->haystack_len - hidx >= m->needle_len - nidx) {
                    stack_push(stack, hidx, nidx, last_idx, score, positions);
                }

                last_idx = match_pos;
                positions[nidx] = match_pos;
                score += char_score;
            }

            // NOTE(review): `mem` is a by-value copy of the memo entry, so this
            // score assignment does not reach m->memo; whether the memcpy does
            // depends on whether MemoryItem.positions is a pointer or an
            // embedded array (definition not visible here). hidx/nidx/last_idx
            // have also been advanced by the loop above. Confirm the intent.
            mem.score = score;
            memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
        }

        // Score for this (hidx, nidx, last_idx) state is known: keep it if it is the best so far
        if (score > best_score) {
            best_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return best_score;
}
// Scans `text` backward, one code point at a time, and returns the offset just
// past the last code point for which requiresContextForWordBoundary() is
// false. Returns 0 when every code point requires context or the text is
// empty.
unsigned startOfLastWordBoundaryContext(StringView text)
{
    unsigned cursor = text.length();
    while (cursor > 0) {
        // Remember where this code point ends before U16_PREV rewinds cursor.
        const unsigned endOfCodePoint = cursor;
        UChar32 codePoint;
        U16_PREV(text, 0, cursor, codePoint);
        if (!requiresContextForWordBoundary(codePoint))
            return endOfCodePoint;
    }
    return 0;
}
/*
 * Score one haystack against the needle and return the best score found.
 *
 * m                match state: needle/haystack buffers and lengths, the memo
 *                  table and max_score_per_char. The haystack is searched with
 *                  u_strchr32(), which takes no length argument.
 * stack            explicit work stack of (hidx, nidx, last_idx, score,
 *                  positions) states; drained until empty.
 * final_positions  receives the needle_len match positions of the best-scoring
 *                  alignment. The scratch `positions` array lives at
 *                  final_positions + needle_len, so the buffer must hold at
 *                  least 2 * needle_len entries.
 *
 * Returns 0.0 when no alignment scores above zero.
 */
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 needle_char, cur_char, prev_char;
    UChar *found;
    double best_score = 0.0, score = 0.0, char_score = 0.0;
    int32_t match_pos, next_nidx, cursor, hidx, nidx, last_idx, gap;
    int32_t *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];

        if (mem.score != DBL_MAX) {
            // Memoized result available: reuse it
            score = mem.score;
            memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        } else {
            // No memoized result: walk the remaining needle characters
            next_nidx = nidx;
            while (next_nidx < m->needle_len) {
                nidx = next_nidx;
                U16_NEXT(m->needle, next_nidx, m->needle_len, needle_char); // next_nidx now points to the next char in needle

                if (m->haystack_len - hidx < m->needle_len - nidx) {
                    score = 0.0;
                    break;
                }

                found = u_strchr32(m->haystack + hidx, needle_char); // TODO: Use primary collation for the find
                if (found == NULL) {
                    score = 0.0;
                    break;
                }
                match_pos = (int32_t)(found - m->haystack);

                gap = u_countChar32(m->haystack + last_idx, match_pos - last_idx);
                if (gap <= 1) {
                    char_score = m->max_score_per_char;
                } else {
                    U16_GET(m->haystack, 0, match_pos, m->haystack_len, cur_char);
                    cursor = match_pos;
                    U16_PREV(m->haystack, 0, cursor, prev_char); // prev_char is the character before the match
                    char_score = calc_score_for_char(m, prev_char, cur_char, gap);
                }

                // Step over the matched character to get the next haystack start offset
                cursor = match_pos;
                U16_NEXT(m->haystack, cursor, m->haystack_len, cur_char);
                hidx = cursor;

                // Queue the alternative of matching this same needle char further along
                if (m->haystack_len - hidx >= m->needle_len - nidx) {
                    stack_push(stack, hidx, nidx, last_idx, score, positions);
                }

                last_idx = match_pos;
                positions[nidx] = match_pos;
                score += char_score;
            }

            // NOTE(review): `mem` is a by-value copy of the memo entry, so this
            // score assignment does not reach m->memo; whether the memcpy does
            // depends on whether MemoryItem.positions is a pointer or an
            // embedded array (definition not visible here). hidx/nidx/last_idx
            // have also been advanced by the loop above. Confirm the intent.
            mem.score = score;
            memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);
        }

        // Score for this (hidx, nidx, last_idx) state is known: keep it if it is the best so far
        if (score > best_score) {
            best_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return best_score;
}
static void demo_utf_h_macros() { static UChar input[]={ 0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062 }; UChar32 c; int32_t i; UBool isError; printf("\n* demo_utf_h_macros() -------------- ***\n\n"); printUString("iterate forward through: ", input, UPRV_LENGTHOF(input)); for(i=0; i<UPRV_LENGTHOF(input); /* U16_NEXT post-increments */) { /* Iterating forwards Codepoint at offset 0: U+0061 Codepoint at offset 1: U+10000 Codepoint at offset 3: U+10ffff Codepoint at offset 5: U+0062 */ printf("Codepoint at offset %d: U+", i); U16_NEXT(input, i, UPRV_LENGTHOF(input), c); printf("%04x\n", c); } puts(""); isError=FALSE; i=1; /* write position, gets post-incremented so needs to be in an l-value */ U16_APPEND(input, i, UPRV_LENGTHOF(input), 0x0062, isError); printUString("iterate backward through: ", input, UPRV_LENGTHOF(input)); for(i=UPRV_LENGTHOF(input); i>0; /* U16_PREV pre-decrements */) { U16_PREV(input, 0, i, c); /* Iterating backwards Codepoint at offset 5: U+0062 Codepoint at offset 3: U+10ffff Codepoint at offset 2: U+dc00 -- unpaired surrogate because lead surr. overwritten Codepoint at offset 1: U+0062 -- by this BMP code point Codepoint at offset 0: U+0061 */ printf("Codepoint at offset %d: U+%04x\n", i, c); } }
/*
 * Copy one RTL run from src to dest in reverse order of code points (not code
 * units), so that surrogate pairs stay intact.
 *
 * The source is read backward; all code units of one code point (and, when
 * requested, the combining characters that follow it) are collected and then
 * copied in ascending order to the destination.
 *
 * src         source run
 * srcLength   number of code units in src; an empty (<= 0) run writes nothing
 *             and returns 0
 * dest        destination buffer
 * destSize    capacity of dest in code units
 * options     UBIDI_KEEP_BASE_COMBINING keeps combining characters after their
 *             base character, UBIDI_REMOVE_BIDI_CONTROLS drops BiDi control
 *             characters, UBIDI_DO_MIRRORING replaces the base character with
 *             u_charMirror() of it
 * pErrorCode  set to U_BUFFER_OVERFLOW_ERROR when dest is too small
 *
 * Returns the number of code units written, or the required length when
 * U_BUFFER_OVERFLOW_ERROR is set (nothing is written in that case).
 */
static int32_t
doWriteReverse(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    int32_t i, j;
    UChar32 c;

    /*
     * Every loop below is a do-while that steps backward from srcLength before
     * testing it, so an empty run would read src[-1] (or src[0] in the control
     * counting pass) and store it into dest. There is nothing to write.
     */
    if(srcLength<=0) {
        return 0;
    }

    /* optimize for several combinations of options */
    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
    case 0:
        /*
         * With none of the "complicated" options set, the destination
         * run will have the same length as the source run,
         * and there is no mirroring and no keeping combining characters
         * with their base characters.
         */
        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        destSize=srcLength;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units for one base character */
            U16_BACK_1(src, 0, srcLength);

            /* copy this base character */
            j=srcLength;
            do {
                *dest++=src[j++];
            } while(j<i);
        } while(srcLength>0);
        break;
    case UBIDI_KEEP_BASE_COMBINING:
        /*
         * Here, too, the destination
         * run will have the same length as the source run,
         * and there is no mirroring.
         * We do need to keep combining characters with their base characters.
         */
        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        destSize=srcLength;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units and modifier letters for one base character */
            do {
                U16_PREV(src, 0, srcLength, c);
            } while(srcLength>0 && IS_COMBINING(u_charType(c)));

            /* copy this "user character" */
            j=srcLength;
            do {
                *dest++=src[j++];
            } while(j<i);
        } while(srcLength>0);
        break;
    default:
        /*
         * With several "complicated" options set, this is the most
         * general and the slowest copying of an RTL run.
         * We will do mirroring, remove BiDi controls, and
         * keep combining characters with their base characters
         * as requested.
         */
        if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
            i=srcLength;
        } else {
            /* we need to find out the destination length of the run,
               which will not include the BiDi control characters */
            int32_t length=srcLength;
            UChar ch;

            i=0;
            do {
                ch=*src++;
                if(!IS_BIDI_CONTROL_CHAR(ch)) {
                    ++i;
                }
            } while(--length>0);
            /* rewind src to the start of the run after the counting pass */
            src-=srcLength;
        }

        if(destSize<i) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return i;
        }
        destSize=i;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units for one base character */
            U16_PREV(src, 0, srcLength, c);
            if(options&UBIDI_KEEP_BASE_COMBINING) {
                /* collect modifier letters for this base character */
                while(srcLength>0 && IS_COMBINING(u_charType(c))) {
                    U16_PREV(src, 0, srcLength, c);
                }
            }

            if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
                /* do not copy this BiDi control character */
                continue;
            }

            /* copy this "user character" */
            j=srcLength;
            if(options&UBIDI_DO_MIRRORING) {
                /* mirror only the base character; j skips the code units it occupied */
                int32_t k=0;
                c=u_charMirror(c);
                U16_APPEND_UNSAFE(dest, k, c);
                dest+=k;
                j+=k;
            }
            while(j<i) {
                *dest++=src[j++];
            }
        } while(srcLength>0);
        break;
    } /* end of switch */

    return destSize;
}
static void TestNextPrevChar(){ static UChar input[]={0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062, 0xd841, 0xd7ff, 0xd841, 0xdc41, 0xdc00, 0x0000}; static UChar32 result[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 0x0061, 0x0061, 0x0061, 0x0000, 0x0000, 0x0000, 0x10000, 0x10000, 0x10000, 0x120400, 0xdc00, UTF_ERROR_VALUE, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0x20441, 0x20441, 0x20441, 0x10ffff, 0x10ffff, 0x10ffff, 0xd841, 0xd841, UTF_ERROR_VALUE, 0xdfff, 0xdfff, UTF_ERROR_VALUE, 0xd7ff, 0xd7ff, 0xd7ff, 0x0062, 0x0062, 0x0062, 0xd841, 0xd841, UTF_ERROR_VALUE, 0x1ffff, 0xd841, UTF_ERROR_VALUE, 0x0062, 0x0062, 0x0062, 0xd7ff, 0xd7ff, 0xd7ff, 0x10ffff, 0x10ffff, 0x10ffff, 0x20441, 0x20441, 0x20441, 0xdbff, 0xdbff, UTF_ERROR_VALUE, 0xdc41, 0xdc41, UTF_ERROR_VALUE, 0x10000, 0x10000, 0x10000, 0xdc00, 0xdc00, UTF_ERROR_VALUE, 0xd800, 0xd800, UTF_ERROR_VALUE, 0x0000, 0x0000, 0x0000, 0x0061, 0x0061, 0x0061 }; static uint16_t movedOffset[]={ /*next_unsafe next_safe_ns next_safe_s prev_unsafe prev_safe_ns prev_safe_s*/ 1, 1, 1, 11, 11, 11, 3, 3, 3, 9, 10 , 10, 3, 3, 3, 8, 8, 8, 5, 5, 4, 8, 8, 8, 5, 5, 5, 7, 7, 7, 6, 6, 6, 6, 6, 6, 8, 7, 7, 5, 5, 5, 8, 8, 8, 3, 3, 3, 10, 10, 10, 3, 3, 3, 10, 10, 10, 1, 1, 1, 11, 11, 11, 1, 1, 1, 12, 12, 12, 0, 0, 0, }; UChar32 c=0x0000; uint16_t i=0; uint16_t offset=0, setOffset=0; for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++){ setOffset=offset; UTF16_NEXT_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%ld. 
Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; U16_NEXT_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i], setOffset); } if(c != result[i]){ log_err("ERROR: U16_NEXT_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c); if(setOffset != movedOffset[i+1]){ log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+1], setOffset); } if(c != result[i+1]){ log_err("ERROR: U16_NEXT failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+1], c); } setOffset=offset; UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE); if(setOffset != movedOffset[i+1]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+2], setOffset); } if(c != result[i+2]){ log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+2], c); } i=(uint16_t)(i+6); } i=0; for(offset=(uint16_t)sizeof(input)/U_SIZEOF_UCHAR; offset > 0; --offset){ setOffset=offset; UTF16_PREV_CHAR_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; U16_PREV_UNSAFE(input, setOffset, c); if(setOffset != movedOffset[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+3], setOffset); } if(c != result[i+3]){ log_err("ERROR: U16_PREV_CHAR_UNSAFE failed for offset=%ld. Expected:%lx Got:%lx\n", offset, result[i+3], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE); if(setOffset != movedOffset[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; U16_PREV(input, 0, setOffset, c); if(setOffset != movedOffset[i+4]){ log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+4], setOffset); } if(c != result[i+4]){ log_err("ERROR: U16_PREV failed for input=%ld. Expected:%lx Got:%lx\n", offset, result[i+4], c); } setOffset=offset; UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, TRUE); if(setOffset != movedOffset[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n", offset, movedOffset[i+5], setOffset); } if(c != result[i+5]){ log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed for input=%ld. 
Expected:%lx Got:%lx\n", offset, result[i+5], c); } i=(uint16_t)(i+6); } }