Пример #1
0
/**
 * Returns true if the specfied position is a boundary position.  As a side
 * effect, leaves the iterator pointing to the first boundary position at
 * or after "offset".
 * @param offset the offset to check.
 * @return True if "offset" is a boundary position.
 */
UBool BreakIterator::isBoundary(int32_t offset) {
    // the beginning index of the iterator is always a boundary position by definition
    if (offset == 0) {
        first();       // For side effects on current position, tag values.
        return TRUE;
    }

    if (offset == (int32_t)utext_nativeLength(fText)) {
        last();       // For side effects on current position, tag values.
        return TRUE;
    }

    // out-of-range indexes are never boundary positions
    if (offset < 0) {
        first();       // For side effects on current position, tag values.
        return FALSE;
    }

    if (offset > utext_nativeLength(fText)) {
        last();        // For side effects on current position, tag values.
        return FALSE;
    }

    // otherwise, we can use following() on the position before the specified
    // one and return true if the position we get back is the one the user
    // specified
    utext_previous32From(fText, offset);
    int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText);
    UBool    result  = following(backOne) == offset;
    return result;
}
Пример #2
0
/**
 * Sets the current iteration position to the end of the text.
 * @return The text's past-the-end offset.
 */
int32_t BreakIterator::last(void) {
    reset();
    if (fText == NULL) {
        fLastRuleStatusIndex  = 0;
        fLastStatusIndexValid = TRUE;
        return BreakIterator::DONE;
    }

    fLastStatusIndexValid = FALSE;
    int32_t pos = (int32_t)utext_nativeLength(fText);
    utext_setNativeIndex(fText, pos);
    return pos;
}
Пример #3
0
//---------------------------------------------------------------------
//
//   pattern
//
//---------------------------------------------------------------------
UnicodeString RegexPattern::pattern() const {
    if (fPatternString != NULL) {
        return *fPatternString;
    } else if (fPattern == NULL) {
        return UnicodeString();
    } else {
        UErrorCode status = U_ZERO_ERROR;
        int64_t nativeLen = utext_nativeLength(fPattern);
        int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error
        UnicodeString result;

        status = U_ZERO_ERROR;
        UChar *resultChars = result.getBuffer(len16);
        utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning
        result.releaseBuffer(len16);

        return result;
    }
}
Пример #4
0
/**
 * Sets the iterator to refer to the last boundary position before the
 * specified position.
 * @offset The position to begin searching for a break from.
 * @return The position of the last boundary before the starting position.
 */
int32_t BreakIterator::preceding(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    if (fCachedBreakPositions != NULL) {
        // TODO: binary search?
        // TODO: What if offset is outside range, but break is not?
        if (offset > fCachedBreakPositions[0]
                && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            while (fPositionInCache < fNumCachedBreakPositions
                   && offset > fCachedBreakPositions[fPositionInCache])
                ++fPositionInCache;
            --fPositionInCache;
            // If we're at the beginning of the cache, need to reevaluate the
            // rule status
            if (fPositionInCache <= 0) {
                fLastStatusIndexValid = FALSE;
            }
            utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]);
            return fCachedBreakPositions[fPositionInCache];
        }
        else {
            reset();
        }
    }

    // if the offset passed in is already past the end of the text,
    // just return DONE; if it's before the beginning, return the
    // text's starting offset
    if (fText == NULL || offset > utext_nativeLength(fText)) {
        // return BreakIterator::DONE;
        return last();
    }
    else if (offset < 0) {
        return first();
    }

    // if we start by updating the current iteration position to the
    // position specified by the caller, we can just use previous()
    // to carry out this operation

    if (fSafeFwdTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        if (newOffset != offset) {
            // Will come here if specified offset was not a code point boundary AND
            //   the underlying implmentation is using UText, which snaps any non-code-point-boundary
            //   indices to the containing code point.
            // For breakitereator::preceding only, these non-code-point indices need to be moved
            //   up to refer to the following codepoint.
            UTEXT_NEXT32(fText);
            offset = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        }

        // TODO:  (synwee) would it be better to just check for being in the middle of a surrogate pair,
        //        rather than adjusting the position unconditionally?
        //        (Change would interact with safe rules.)
        // TODO:  change RBBI behavior for off-boundary indices to match that of UText?
        //        affects only preceding(), seems cleaner, but is slightly different.
        UTEXT_PREVIOUS32(fText);
        handleNext(fSafeFwdTable);
        int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText);
        while (result >= offset) {
            result = previous();
        }
        return result;
    }
    if (fSafeRevTable != NULL) {
        // backup plan if forward safe table is not available
        //  TODO:  check whether this path can be discarded
        //         It's probably OK to say that rules must supply both safe tables
        //            if they use safe tables at all.  We have certainly never described
        //            to anyone how to work with just one safe table.
        utext_setNativeIndex(fText, offset);
        UTEXT_NEXT32(fText);
        
        // handle previous will give result <= offset
        handlePrevious(fSafeRevTable);

        // next will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to
        int32_t oldresult = next();
        while (oldresult < offset) {
            int32_t result = next();
            if (result >= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        int32_t result = previous();
        if (result >= offset) {
            return previous();
        }
        return result;
    }

    // old rule syntax
    utext_setNativeIndex(fText, offset);
    return previous();
}
Пример #5
0
/**
 * Sets the iterator to refer to the first boundary position following
 * the specified position.
 * @offset The position from which to begin searching for a break position.
 * @return The position of the first break after the current position.
 */
int32_t BreakIterator::following(int32_t offset) {
    // if we have cached break positions and offset is in the range
    // covered by them, use them
    // TODO: could use binary search
    // TODO: what if offset is outside range, but break is not?
    if (fCachedBreakPositions != NULL) {
        if (offset >= fCachedBreakPositions[0]
                && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) {
            fPositionInCache = 0;
            // We are guaranteed not to leave the array due to range test above
            while (offset >= fCachedBreakPositions[fPositionInCache]) {
                ++fPositionInCache;
            }
            int32_t pos = fCachedBreakPositions[fPositionInCache];
            utext_setNativeIndex(fText, pos);
            return pos;
        }
        else {
            reset();
        }
    }

    // if the offset passed in is already past the end of the text,
    // just return DONE; if it's before the beginning, return the
    // text's starting offset
    fLastRuleStatusIndex  = 0;
    fLastStatusIndexValid = TRUE;
    if (fText == NULL || offset >= utext_nativeLength(fText)) {
        last();
        return next();
    }
    else if (offset < 0) {
        return first();
    }

    // otherwise, set our internal iteration position (temporarily)
    // to the position passed in.  If this is the _beginning_ position,
    // then we can just use next() to get our return value

    int32_t result = 0;

    if (fSafeRevTable != NULL) {
        // new rule syntax
        utext_setNativeIndex(fText, offset);
        // move forward one codepoint to prepare for moving back to a
        // safe point.
        // this handles offset being between a supplementary character
        UTEXT_NEXT32(fText);
        // handlePrevious will move most of the time to < 1 boundary away
        handlePrevious(fSafeRevTable);
        int32_t result = next();
        while (result <= offset) {
            result = next();
        }
        return result;
    }
    if (fSafeFwdTable != NULL) {
        // backup plan if forward safe table is not available
        utext_setNativeIndex(fText, offset);
        UTEXT_PREVIOUS32(fText);
        // handle next will give result >= offset
        handleNext(fSafeFwdTable);
        // previous will give result 0 or 1 boundary away from offset,
        // most of the time
        // we have to
        int32_t oldresult = previous();
        while (oldresult > offset) {
            int32_t result = previous();
            if (result <= offset) {
                return oldresult;
            }
            oldresult = result;
        }
        int32_t result = next();
        if (result <= offset) {
            return next();
        }
        return result;
    }
    // otherwise, we have to sync up first.  Use handlePrevious() to back
    // up to a known break position before the specified position (if
    // we can determine that the specified position is a break position,
    // we don't back up at all).  This may or may not be the last break
    // position at or before our starting position.  Advance forward
    // from here until we've passed the starting position.  The position
    // we stop on will be the first break position after the specified one.
    // old rule syntax

    utext_setNativeIndex(fText, offset);
    if (offset==0 || 
        offset==1  && utext_getNativeIndex(fText)==0) {
        return next();
    }
    result = previous();

    while (result != BreakIterator::DONE && result <= offset) {
        result = next();
    }

    return result;
}
Пример #6
0
static void TestAPI(void) {
    UErrorCode      status = U_ZERO_ERROR;
    UBool           gFailed = FALSE;
    (void)gFailed;   /* Suppress set but not used warning. */

    /* Open    */
    {
        UText           utLoc = UTEXT_INITIALIZER;
        const char *    cString = "\x61\x62\x63\x64";
        UChar           uString[]  = {0x41, 0x42, 0x43, 0};
        UText          *uta;
        UText          *utb;
        UChar           c;

        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        c = utext_next32(uta);
        TEST_ASSERT(c == 0x41);
        utb = utext_close(uta); 
        TEST_ASSERT(utb == NULL);

        uta = utext_openUTF8(&utLoc, cString, -1, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(uta == &utLoc);

        uta = utext_close(&utLoc);
        TEST_ASSERT(uta == &utLoc);
    }

    /* utext_clone()  */
    {
        UChar   uString[]  = {0x41, 0x42, 0x43, 0};
        int64_t len;
        UText   *uta;
        UText   *utb;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        utb = utext_clone(NULL, uta, FALSE, FALSE, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(utb != NULL);
        TEST_ASSERT(utb != uta);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        utext_close(uta);
        utext_close(utb);
    }

    /* basic access functions  */
    {
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UText     *uta;
        UChar32   c;
        int64_t   len;
        UBool     b;
        int64_t   i;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_ASSERT(uta!=NULL);
        TEST_SUCCESS(status);
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==TRUE);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==FALSE);

        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);
        
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32From(uta, 1);
        TEST_ASSERT(c==uString[1]);
        c = utext_next32From(uta, u_strlen(uString));
        TEST_ASSERT(c==U_SENTINEL);

        c = utext_previous32From(uta, 2);
        TEST_ASSERT(c==uString[1]);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i == 1);

        utext_setNativeIndex(uta, 0);
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==1);

        b = utext_moveIndex32(uta, u_strlen(uString)-1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==FALSE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        utext_setNativeIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==uString[0]);
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==U_SENTINEL);


        utext_close(uta);
    }

    {
        /*
         * UText opened on a NULL string with zero length
         */
        UText    *uta;
        UChar32   c;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);

        uta = utext_openUTF8(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);
    }


    {
        /*
         * extract
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UChar     buf[100];
        int32_t   i;
        /* Test pinning of input bounds */
        UChar     uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
                                 0x46, 0x47, 0x48, 0x49, 0x4A, 0};
        UChar *   uString2Ptr = uString2 + 5;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        i = utext_extract(uta, 0, 100, NULL, 0, &status);
        TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(i == u_strlen(uString));

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, 0, 100, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString));
        i = u_strcmp(uString, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);

        /* Test pinning of input bounds */
        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, -3, 20, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString2Ptr));
        i = u_strcmp(uString2Ptr, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);
    }

    {
        /*
         *  Copy, Replace, isWritable
         *    Can't create an editable UText from plain C, so all we
         *    can easily do is check that errors returned.
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UBool     b;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        b = utext_isWritable(uta);
        TEST_ASSERT(b == FALSE);

        b = utext_hasMetaData(uta);
        TEST_ASSERT(b == FALSE);

        utext_replace(uta,
                      0, 1,     /* start, limit */
                      uString, -1,  /* replacement, replacement length */
                      &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);


        utext_copy(uta,
                   0, 1,         /* start, limit      */
                   2,            /* destination index */
                   FALSE,        /* move flag         */
                   &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_close(uta);
    }


}