Beispiel #1
0
/*
 * Case-context iterator callback for UTF-16 text.
 *
 * context points to a UCaseContext; dir selects what to do:
 *   dir<0   restart at cpStart and walk backward,
 *   dir>0   restart at cpLimit and walk forward,
 *   dir==0  keep walking in the direction stored by the last restart.
 *
 * Returns the next code point in the active direction, or U_SENTINEL
 * once the [start, limit) range of the context is exhausted.
 */
static UChar32 U_CALLCONV
utf16_caseContextIterator(void *context, int8_t dir) {
    UCaseContext *ctx=(UCaseContext *)context;
    UChar32 cp;

    if(dir==0) {
        /* no restart requested: reuse the remembered direction */
        dir=ctx->dir;
    } else {
        /* restart: backward iteration begins at cpStart, forward at cpLimit */
        if(dir<0) {
            ctx->index=ctx->cpStart;
        } else {
            ctx->index=ctx->cpLimit;
        }
        ctx->dir=dir;
    }

    if(dir<0) {
        /* backward: there must be text between start and index */
        if(ctx->index>ctx->start) {
            U16_PREV((const UChar *)ctx->p, ctx->start, ctx->index, cp);
            return cp;
        }
        return U_SENTINEL;
    }

    /* forward (also taken when the stored direction is 0) */
    if(ctx->limit>ctx->index) {
        U16_NEXT((const UChar *)ctx->p, ctx->index, ctx->limit, cp);
        return cp;
    }
    return U_SENTINEL;
}
Beispiel #2
0
/**
 * Locates the start of the trailing white-space run of a UTF-16 string.
 *
 * @param s      The string; NULL yields NULL.
 * @param length Number of code units in s; a negative value means the
 *               length is obtained with u_strlen(s).
 * @return Pointer to the first code unit of the trailing run of characters
 *         that are U+0020 or satisfy u_isUWhiteSpace(); this is s+length
 *         when the string does not end in white space, and s when the
 *         string is empty or consists only of white space.
 */
U_CAPI const UChar * U_EXPORT2
u_strTrailingWhiteSpaceStart(const UChar *s, int32_t length) {
    int32_t pos, trailingStart;
    UChar32 cp = 0;

    if(s == NULL) {
        return NULL;
    }
    if(length == 0) {
        return s;
    }
    if(length < 0) {
        length = u_strlen(s);
    }

    /* Walk backward one code point at a time while white space is seen. */
    pos = length;
    trailingStart = length;
    while(pos > 0) {
        U16_PREV(s, 0, pos, cp);
        if(cp != 0x20 && !u_isUWhiteSpace(cp)) {
            /* cp is not white space: the run starts after it */
            break;
        }
        trailingStart = pos;
    }

    return s + trailingStart;
}
inline bool SearchBuffer::isWordStartMatch(size_t start, size_t length) const
{
    ASSERT(m_options & AtWordStarts);

    if (!start)
        return true;

    int size = m_buffer.size();
    int offset = start;
    UChar32 firstCharacter;
    U16_GET(m_buffer.data(), 0, offset, size, firstCharacter);

    if (m_options & TreatMedialCapitalAsWordStart) {
        UChar32 previousCharacter;
        U16_PREV(m_buffer.data(), 0, offset, previousCharacter);

        if (isSeparator(firstCharacter)) {
            // The start of a separator run is a word start (".org" in "webkit.org").
            if (!isSeparator(previousCharacter))
                return true;
        } else if (isASCIIUpper(firstCharacter)) {
            // The start of an uppercase run is a word start ("Kit" in "WebKit").
            if (!isASCIIUpper(previousCharacter))
                return true;
            // The last character of an uppercase run followed by a non-separator, non-digit
            // is a word start ("Request" in "XMLHTTPRequest").
            offset = start;
            U16_FWD_1(m_buffer.data(), offset, size);
            UChar32 nextCharacter = 0;
            if (offset < size)
                U16_GET(m_buffer.data(), 0, offset, size, nextCharacter);
            if (!isASCIIUpper(nextCharacter) && !isASCIIDigit(nextCharacter) && !isSeparator(nextCharacter))
                return true;
        } else if (isASCIIDigit(firstCharacter)) {
            // The start of a digit run is a word start ("2" in "WebKit2").
            if (!isASCIIDigit(previousCharacter))
                return true;
        } else if (isSeparator(previousCharacter) || isASCIIDigit(previousCharacter)) {
            // The start of a non-separator, non-uppercase, non-digit run is a word start,
            // except after an uppercase. ("org" in "webkit.org", but not "ore" in "WebCore").
            return true;
        }
    }

    // Chinese and Japanese lack word boundary marks, and there is no clear agreement on what constitutes
    // a word, so treat the position before any CJK character as a word start.
    if (Character::isCJKIdeographOrSymbol(firstCharacter))
        return true;

    size_t wordBreakSearchStart = start + length;
    while (wordBreakSearchStart > start)
        wordBreakSearchStart = findNextWordFromIndex(m_buffer.data(), m_buffer.size(), wordBreakSearchStart, false /* backwards */);
    if (wordBreakSearchStart != start)
        return false;
    if (m_options & WholeWord)
        return static_cast<int>(start + length) == findWordEndBoundary(m_buffer.data(), m_buffer.size(), wordBreakSearchStart);
    return true;
}
Beispiel #4
0
int startOfLastWordBoundaryContext(const UChar* characters, int length) {
  for (int i = length; i > 0;) {
    int last = i;
    UChar32 ch;
    U16_PREV(characters, 0, i, ch);
    if (!requiresContextForWordBoundary(ch))
      return last;
  }
  return 0;
}
Beispiel #5
0
// Computes the best score for matching m->needle against m->haystack and
// records the haystack offsets at which the needle characters matched.
//
// m               - match state (needle, haystack, their lengths, memo table, scoring limits).
// stack           - explicit work stack used instead of recursion; it is drained before returning.
// final_positions - output; the first m->needle_len entries receive the positions of the best
//                   match. The m->needle_len entries after them are used as scratch space
//                   (see `positions`), so the buffer must hold at least 2 * m->needle_len entries.
// searches        - per-needle-offset string searchers, indexed by the code-unit offset of the
//                   needle character; a NULL entry ends the current attempt with score 0.
//
// Returns the highest score found, or 0.0 when no attempt scored above zero.
//
// NOTE(review): `mem` is a by-value copy of the memo entry. `mem.score = score` below only
// changes that local copy, and hidx/nidx/last_idx have been modified by the loop by then, so
// the score is never stored back into m->memo here. Whether the memcpy into mem.positions
// reaches the table depends on whether MemoryItem.positions is a pointer or an array, which
// is not visible in this block - confirm the intended memoization.
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions, UStringSearch **searches) {
    UChar32 hc, lc;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    // positions is the scratch area located directly after the output area of final_positions
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};
    UStringSearch *search = NULL;
    UErrorCode status = U_ZERO_ERROR;

    // Seed the work stack: start of haystack, start of needle, no previous match, zero score
    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        // presumably restores the state saved by the matching stack_push - verify against stack_pop
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_FWD_1(m->needle, i, m->needle_len);// i now points to next char in needle 
                search = searches[nidx];
                // Give up when there is no searcher or fewer haystack units remain than needle units
                if (search == NULL || m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                status = U_ZERO_ERROR; // We ignore any errors as we already know that hidx is correct
                usearch_setOffset(search, hidx, &status);
                status = U_ZERO_ERROR;
                pos = usearch_next(search, &status);
                if (pos == USEARCH_DONE) { score = 0.0; break; } // No matches found
                // Number of code points between the previous match and this one
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);  
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    // distance > 1 implies pos > 0, so there is a character before pos to read
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc); 
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                // Advance hidx past the matched haystack character (hc is only read to move j)
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc); 
                hidx = j;
                // Save an alternative: retry the same needle character further along the haystack,
                // using last_idx and score from before this match is accounted for
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos; 
                positions[nidx] = pos; 
                score += score_for_char;
            } // for(i) iterate over needle
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);

        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
// Walks |text| backward from its end and returns the offset just after the
// last character that does not require context for word boundary
// detection. Returns 0 if every character requires context or the text
// is empty.
unsigned startOfLastWordBoundaryContext(StringView text)
{
    unsigned cursor = text.length();
    while (cursor > 0) {
        // Keep the end of this character; U16_PREV moves the cursor before it.
        const unsigned characterEnd = cursor;
        UChar32 character;
        U16_PREV(text, 0, cursor, character);
        if (requiresContextForWordBoundary(character))
            continue;
        return characterEnd;
    }
    return 0;
}
Beispiel #7
0
// Computes the best score for matching m->needle against m->haystack and
// records the haystack offsets at which the needle characters matched.
// Needle characters are located with u_strchr32, i.e. by exact code point.
//
// m               - match state (needle, haystack, their lengths, memo table, scoring limits).
// stack           - explicit work stack used instead of recursion; it is drained before returning.
// final_positions - output; the first m->needle_len entries receive the positions of the best
//                   match. The m->needle_len entries after them are used as scratch space
//                   (see `positions`), so the buffer must hold at least 2 * m->needle_len entries.
//
// Returns the highest score found, or 0.0 when no attempt scored above zero.
//
// NOTE(review): `mem` is a by-value copy of the memo entry. `mem.score = score` below only
// changes that local copy, and hidx/nidx/last_idx have been modified by the loop by then, so
// the score is never stored back into m->memo here. Whether the memcpy into mem.positions
// reaches the table depends on whether MemoryItem.positions is a pointer or an array, which
// is not visible in this block - confirm the intended memoization.
static double process_item(MatchInfo *m, Stack *stack, int32_t *final_positions) {
    UChar32 nc, hc, lc;
    UChar *p;
    double final_score = 0.0, score = 0.0, score_for_char = 0.0;
    // positions is the scratch area located directly after the output area of final_positions
    int32_t pos, i, j, hidx, nidx, last_idx, distance, *positions = final_positions + m->needle_len;
    MemoryItem mem = {0};

    // Seed the work stack: start of haystack, start of needle, no previous match, zero score
    stack_push(stack, 0, 0, 0, 0.0, final_positions);

    while (stack->pos >= 0) {
        // presumably restores the state saved by the matching stack_push - verify against stack_pop
        stack_pop(stack, &hidx, &nidx, &last_idx, &score, positions);
        mem = m->memo[hidx][nidx][last_idx];
        if (mem.score == DBL_MAX) {
            // No memoized result, calculate the score
            for (i = nidx; i < m->needle_len;) {
                nidx = i;
                U16_NEXT(m->needle, i, m->needle_len, nc); // i now points to next char in needle 
                // Give up when fewer haystack units remain than needle units
                if (m->haystack_len - hidx < m->needle_len - nidx) { score = 0.0; break; }
                // NOTE(review): no length is passed, so this presumably relies on the haystack
                // being NUL-terminated - confirm against the caller
                p = u_strchr32(m->haystack + hidx, nc);  // TODO: Use primary collation for the find
                if (p == NULL) { score = 0.0; break; }
                pos = (int32_t)(p - m->haystack);
                // Number of code points between the previous match and this one
                distance = u_countChar32(m->haystack + last_idx, pos - last_idx);  
                if (distance <= 1) score_for_char = m->max_score_per_char;
                else {
                    // distance > 1 implies pos > 0, so there is a character before pos to read
                    U16_GET(m->haystack, 0, pos, m->haystack_len, hc); 
                    j = pos;
                    U16_PREV(m->haystack, 0, j, lc); // lc is the prev character
                    score_for_char = calc_score_for_char(m, lc, hc, distance);
                }
                // Advance hidx past the matched haystack character (hc is only read to move j)
                j = pos;
                U16_NEXT(m->haystack, j, m->haystack_len, hc); 
                hidx = j;
                // Save an alternative: retry the same needle character further along the haystack,
                // using last_idx and score from before this match is accounted for
                if (m->haystack_len - hidx >= m->needle_len - nidx) stack_push(stack, hidx, nidx, last_idx, score, positions);
                last_idx = pos; 
                positions[nidx] = pos; 
                score += score_for_char;
            } // for(i) iterate over needle
            mem.score = score; memcpy(mem.positions, positions, sizeof(*positions) * m->needle_len);

        } else {
            score = mem.score; memcpy(positions, mem.positions, sizeof(*positions) * m->needle_len);
        }
        // We have calculated the score for this hidx, nidx, last_idx combination, update final_score and final_positions, if needed
        if (score > final_score) {
            final_score = score;
            memcpy(final_positions, positions, sizeof(*positions) * m->needle_len);
        }
    }
    return final_score;
}
Beispiel #8
0
/*
 * Demonstrates the UTF-16 macros: iterates a small sample string forward
 * with U16_NEXT, overwrites one code unit with U16_APPEND, then iterates
 * the modified string backward with U16_PREV, printing every code point.
 */
static void
demo_utf_h_macros() {
    static UChar input[]={ 0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062 };
    UChar32 codePoint;
    int32_t offset;
    UBool appendFailed;

    printf("\n* demo_utf_h_macros() -------------- ***\n\n");

    printUString("iterate forward through: ", input, UPRV_LENGTHOF(input));
    /*
     * Expected output of the forward pass:
     *   Codepoint at offset 0: U+0061
     *   Codepoint at offset 1: U+10000
     *   Codepoint at offset 3: U+10ffff
     *   Codepoint at offset 5: U+0062
     */
    offset=0;
    while(offset<UPRV_LENGTHOF(input)) {
        /* print the offset first: U16_NEXT post-increments it */
        printf("Codepoint at offset %d: U+", offset);
        U16_NEXT(input, offset, UPRV_LENGTHOF(input), codePoint);
        printf("%04x\n", codePoint); 
    }

    puts("");

    /* the write position is post-incremented by U16_APPEND, so it must be an l-value */
    appendFailed=FALSE;
    offset=1;
    U16_APPEND(input, offset, UPRV_LENGTHOF(input), 0x0062, appendFailed);

    printUString("iterate backward through: ", input, UPRV_LENGTHOF(input));
    /*
     * Expected output of the backward pass:
     *   Codepoint at offset 5: U+0062
     *   Codepoint at offset 3: U+10ffff
     *   Codepoint at offset 2: U+dc00 -- unpaired surrogate because lead surr. overwritten
     *   Codepoint at offset 1: U+0062 -- by this BMP code point
     *   Codepoint at offset 0: U+0061
     */
    offset=UPRV_LENGTHOF(input);
    while(offset>0) {
        /* U16_PREV pre-decrements, so the offset printed is where the code point starts */
        U16_PREV(input, 0, offset, codePoint);
        printf("Codepoint at offset %d: U+%04x\n", offset, codePoint);
    }
}
Beispiel #9
0
/*
 * Copies the UTF-16 run src[0..srcLength) to dest in reverse order of
 * code points (not code units), honoring the UBIDI_* option bits.
 *
 * src, srcLength  the source run
 * dest, destSize  the destination buffer and its capacity in code units
 * options         only UBIDI_REMOVE_BIDI_CONTROLS, UBIDI_DO_MIRRORING and
 *                 UBIDI_KEEP_BASE_COMBINING are examined here
 * pErrorCode      set to U_BUFFER_OVERFLOW_ERROR when dest is too small;
 *                 it is not read by this function
 *
 * Returns the length of the reversed run in code units. When dest is too
 * small nothing is written and the required length is returned.
 *
 * NOTE(review): every copy loop below is a do/while and therefore runs at
 * least once, reading from src before srcLength is tested. This looks like
 * it relies on callers passing srcLength>0 - confirm at the call sites.
 */
static int32_t
doWriteReverse(const UChar *src, int32_t srcLength,
               UChar *dest, int32_t destSize,
               uint16_t options,
               UErrorCode *pErrorCode) {
    /*
     * RTL run -
     *
     * RTL runs need to be copied to the destination in reverse order
     * of code points, not code units, to keep Unicode characters intact.
     *
     * The general strategy for this is to read the source text
     * in backward order, collect all code units for a code point
     * (and optionally following combining characters, see below),
     * and copy all these code units in ascending order
     * to the destination for this run.
     *
     * Several options request whether combining characters
     * should be kept after their base characters,
     * whether BiDi control characters should be removed, and
     * whether characters should be replaced by their mirror-image
     * equivalent Unicode characters.
     */
    int32_t i, j;
    UChar32 c;

    /* optimize for several combinations of options */
    switch(options&(UBIDI_REMOVE_BIDI_CONTROLS|UBIDI_DO_MIRRORING|UBIDI_KEEP_BASE_COMBINING)) {
    case 0:
        /*
         * With none of the "complicated" options set, the destination
         * run will have the same length as the source run,
         * and there is no mirroring and no keeping combining characters
         * with their base characters.
         */
        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        /* from here on destSize holds the return value, not the capacity */
        destSize=srcLength;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units for one base character */
            U16_BACK_1(src, 0, srcLength);

            /* copy this base character */
            j=srcLength;
            do {
                *dest++=src[j++];
            } while(j<i);
        } while(srcLength>0);
        break;
    case UBIDI_KEEP_BASE_COMBINING:
        /*
         * Here, too, the destination
         * run will have the same length as the source run,
         * and there is no mirroring.
         * We do need to keep combining characters with their base characters.
         */
        if(destSize<srcLength) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return srcLength;
        }
        /* from here on destSize holds the return value, not the capacity */
        destSize=srcLength;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units and modifier letters for one base character */
            do {
                U16_PREV(src, 0, srcLength, c);
            } while(srcLength>0 && IS_COMBINING(u_charType(c)));

            /* copy this "user character" */
            j=srcLength;
            do {
                *dest++=src[j++];
            } while(j<i);
        } while(srcLength>0);
        break;
    default:
        /*
         * With several "complicated" options set, this is the most
         * general and the slowest copying of an RTL run.
         * We will do mirroring, remove BiDi controls, and
         * keep combining characters with their base characters
         * as requested.
         */
        if(!(options&UBIDI_REMOVE_BIDI_CONTROLS)) {
            i=srcLength;
        } else {
            /* we need to find out the destination length of the run,
               which will not include the BiDi control characters */
            int32_t length=srcLength;
            UChar ch;

            /* count the code units that are not BiDi controls; src is advanced here ... */
            i=0;
            do {
                ch=*src++;
                if(!IS_BIDI_CONTROL_CHAR(ch)) {
                    ++i;
                }
            } while(--length>0);
            /* ... and rewound to the start of the run again */
            src-=srcLength;
        }

        /* i is the required destination length at this point */
        if(destSize<i) {
            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
            return i;
        }
        destSize=i;

        /* preserve character integrity */
        do {
            /* i is always after the last code unit known to need to be kept in this segment */
            i=srcLength;

            /* collect code units for one base character */
            U16_PREV(src, 0, srcLength, c);
            if(options&UBIDI_KEEP_BASE_COMBINING) {
                /* collect modifier letters for this base character */
                while(srcLength>0 && IS_COMBINING(u_charType(c))) {
                    U16_PREV(src, 0, srcLength, c);
                }
            }

            if(options&UBIDI_REMOVE_BIDI_CONTROLS && IS_BIDI_CONTROL_CHAR(c)) {
                /* do not copy this BiDi control character */
                /* (continue in a do/while jumps to the srcLength>0 test below) */
                continue;
            }

            /* copy this "user character" */
            j=srcLength;
            if(options&UBIDI_DO_MIRRORING) {
                /* mirror only the base character */
                /*
                 * k counts the code units written for the mirrored character;
                 * advancing j by k skips the base character in src.
                 * NOTE(review): this assumes the mirrored character occupies as
                 * many code units as the original - confirm.
                 */
                int32_t k=0;
                c=u_charMirror(c);
                U16_APPEND_UNSAFE(dest, k, c);
                dest+=k;
                j+=k;
            }
            /* copy whatever is left of the segment (all of it when not mirroring) */
            while(j<i) {
                *dest++=src[j++];
            }
        } while(srcLength>0);
        break;
    } /* end of switch */

    return destSize;
}
Beispiel #10
0
/*
 * Exercises the UTF-16 "next" and "previous" code point macros (old
 * UTF16_* names and new U16_* names) on a fixed sample containing
 * supplementary characters and unpaired surrogates.
 *
 * result[] and movedOffset[] hold six columns per step:
 *   0 next unsafe, 1 next safe non-strict, 2 next safe strict,
 *   3 prev unsafe, 4 prev safe non-strict, 5 prev safe strict.
 * The forward loop consumes rows in order of increasing offset, the
 * backward loop in order of decreasing offset. Every mismatch in the
 * returned code point or in the moved offset is reported with log_err.
 *
 * All offsets are uint16_t (promoted to int) and are printed with %d;
 * code points are printed with %x after an explicit cast to unsigned,
 * so every format specifier matches the type of its argument.
 */
static void TestNextPrevChar(){

    static UChar input[]={0x0061, 0xd800, 0xdc00, 0xdbff, 0xdfff, 0x0062, 0xd841, 0xd7ff, 0xd841, 0xdc41, 0xdc00, 0x0000};
    static UChar32 result[]={
    /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        0x0061,        0x0061,       0x0061,           0x0000,       0x0000,          0x0000,
        0x10000,       0x10000,      0x10000,          0x120400,     0xdc00,          UTF_ERROR_VALUE, 
        0xdc00,        0xdc00,       UTF_ERROR_VALUE,  0x20441,      0x20441,         0x20441,
        0x10ffff,      0x10ffff,     0x10ffff,         0xd841,       0xd841,          UTF_ERROR_VALUE, 
        0xdfff,        0xdfff,       UTF_ERROR_VALUE,  0xd7ff,       0xd7ff,          0xd7ff,   
        0x0062,        0x0062,       0x0062,           0xd841,       0xd841,          UTF_ERROR_VALUE,     
        0x1ffff,       0xd841,       UTF_ERROR_VALUE,  0x0062,       0x0062,          0x0062,
        0xd7ff,        0xd7ff,       0xd7ff,           0x10ffff,     0x10ffff,        0x10ffff,
        0x20441,       0x20441,      0x20441,          0xdbff,       0xdbff,          UTF_ERROR_VALUE,      
        0xdc41,        0xdc41,       UTF_ERROR_VALUE,  0x10000,      0x10000,         0x10000,
        0xdc00,        0xdc00,       UTF_ERROR_VALUE,  0xd800,       0xd800,          UTF_ERROR_VALUE,
        0x0000,        0x0000,       0x0000,           0x0061,       0x0061,          0x0061
    };
    static uint16_t movedOffset[]={
   /*next_unsafe    next_safe_ns  next_safe_s       prev_unsafe   prev_safe_ns     prev_safe_s*/
        1,            1,           1,                11,           11,               11,
        3,            3,           3,                9,            10 ,              10, 
        3,            3,           3,                8,            8,                8,  
        5,            5,           4,                8,            8,                8, 
        5,            5,           5,                7,            7,                7,
        6,            6,           6,                6,            6,                6,
        8,            7,           7,                5,            5,                5,
        8,            8,           8,                3,            3,                3, 
        10,           10,          10,               3,            3,                3,         
        10,           10,          10,               1,            1,                1, 
        11,           11,          11,               1,            1,                1, 
        12,           12,          12,               0,            0,                0, 
    };
      

    UChar32 c=0x0000;
    uint16_t i=0;           /* index of the current row's first column in result[]/movedOffset[] */
    uint16_t offset=0, setOffset=0;

    /* forward: columns 0..2 of each row */
    for(offset=0; offset<sizeof(input)/U_SIZEOF_UCHAR; offset++){
         setOffset=offset;
         UTF16_NEXT_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i]){
             log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i], setOffset);
         }
         if(c != result[i]){
             log_err("ERROR: UTF16_NEXT_CHAR_UNSAFE failed for offset=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i], (unsigned int)c);
         }

         setOffset=offset;
         U16_NEXT_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i]){
             log_err("ERROR: U16_NEXT_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i], setOffset);
         }
         if(c != result[i]){
             log_err("ERROR: U16_NEXT_UNSAFE failed for offset=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i], (unsigned int)c);
         }

         setOffset=offset;
         UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, FALSE);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+1], (unsigned int)c);
         }

         setOffset=offset;
         U16_NEXT(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c);
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: U16_NEXT failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+1]){
             log_err("ERROR: U16_NEXT failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+1], (unsigned int)c);
         }

         setOffset=offset;
         UTF16_NEXT_CHAR_SAFE(input, setOffset, sizeof(input)/U_SIZEOF_UCHAR, c, TRUE);
         /*
          * NOTE(review): the strict offset is checked against the non-strict
          * column (i+1), not the strict column (i+2). The comparison is kept
          * as it was; the message now reports the value actually compared.
          */
         if(setOffset != movedOffset[i+1]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+1], setOffset);
         }
         if(c != result[i+2]){
             log_err("ERROR: UTF16_NEXT_CHAR_SAFE(strict) failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+2], (unsigned int)c);
         }

         i=(uint16_t)(i+6);
    }

    /* backward: columns 3..5 of each row, starting again at the first row */
    i=0;
    for(offset=(uint16_t)sizeof(input)/U_SIZEOF_UCHAR; offset > 0; --offset){
         setOffset=offset;
         UTF16_PREV_CHAR_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         }
         if(c != result[i+3]){
             log_err("ERROR: UTF16_PREV_CHAR_UNSAFE failed for offset=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+3], (unsigned int)c);
         }

         setOffset=offset;
         U16_PREV_UNSAFE(input, setOffset, c);
         if(setOffset != movedOffset[i+3]){
             log_err("ERROR: U16_PREV_UNSAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+3], setOffset);
         }
         if(c != result[i+3]){
             log_err("ERROR: U16_PREV_UNSAFE failed for offset=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+3], (unsigned int)c);
         }

         setOffset=offset;
         UTF16_PREV_CHAR_SAFE(input, 0, setOffset, c, FALSE);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         if(c != result[i+4]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+4], (unsigned int)c);
         }

         setOffset=offset;
         U16_PREV(input, 0, setOffset, c);
         if(setOffset != movedOffset[i+4]){
             log_err("ERROR: U16_PREV failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+4], setOffset);
         }
         if(c != result[i+4]){
             log_err("ERROR: U16_PREV failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+4], (unsigned int)c);
         }

         setOffset=offset;
         UTF16_PREV_CHAR_SAFE(input, 0,  setOffset, c, TRUE);
         if(setOffset != movedOffset[i+5]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed to move the offset correctly at %d\n ExpectedOffset:%d Got %d\n",
                 offset, movedOffset[i+5], setOffset);
         } 
         if(c != result[i+5]){
             log_err("ERROR: UTF16_PREV_CHAR_SAFE(strict) failed for input=%d. Expected:%x Got:%x\n",
                 offset, (unsigned int)result[i+5], (unsigned int)c);
         }

         i=(uint16_t)(i+6);
    }

}