/** * Returns true if the specfied position is a boundary position. As a side * effect, leaves the iterator pointing to the first boundary position at * or after "offset". * @param offset the offset to check. * @return True if "offset" is a boundary position. */ UBool BreakIterator::isBoundary(int32_t offset) { // the beginning index of the iterator is always a boundary position by definition if (offset == 0) { first(); // For side effects on current position, tag values. return TRUE; } if (offset == (int32_t)utext_nativeLength(fText)) { last(); // For side effects on current position, tag values. return TRUE; } // out-of-range indexes are never boundary positions if (offset < 0) { first(); // For side effects on current position, tag values. return FALSE; } if (offset > utext_nativeLength(fText)) { last(); // For side effects on current position, tag values. return FALSE; } // otherwise, we can use following() on the position before the specified // one and return true if the position we get back is the one the user // specified utext_previous32From(fText, offset); int32_t backOne = (int32_t)UTEXT_GETNATIVEINDEX(fText); UBool result = following(backOne) == offset; return result; }
/** * Sets the current iteration position to the end of the text. * @return The text's past-the-end offset. */ int32_t BreakIterator::last(void) { reset(); if (fText == NULL) { fLastRuleStatusIndex = 0; fLastStatusIndexValid = TRUE; return BreakIterator::DONE; } fLastStatusIndexValid = FALSE; int32_t pos = (int32_t)utext_nativeLength(fText); utext_setNativeIndex(fText, pos); return pos; }
//--------------------------------------------------------------------- // // pattern // //--------------------------------------------------------------------- UnicodeString RegexPattern::pattern() const { if (fPatternString != NULL) { return *fPatternString; } else if (fPattern == NULL) { return UnicodeString(); } else { UErrorCode status = U_ZERO_ERROR; int64_t nativeLen = utext_nativeLength(fPattern); int32_t len16 = utext_extract(fPattern, 0, nativeLen, NULL, 0, &status); // buffer overflow error UnicodeString result; status = U_ZERO_ERROR; UChar *resultChars = result.getBuffer(len16); utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning result.releaseBuffer(len16); return result; } }
/** * Sets the iterator to refer to the last boundary position before the * specified position. * @offset The position to begin searching for a break from. * @return The position of the last boundary before the starting position. */ int32_t BreakIterator::preceding(int32_t offset) { // if we have cached break positions and offset is in the range // covered by them, use them if (fCachedBreakPositions != NULL) { // TODO: binary search? // TODO: What if offset is outside range, but break is not? if (offset > fCachedBreakPositions[0] && offset <= fCachedBreakPositions[fNumCachedBreakPositions - 1]) { fPositionInCache = 0; while (fPositionInCache < fNumCachedBreakPositions && offset > fCachedBreakPositions[fPositionInCache]) ++fPositionInCache; --fPositionInCache; // If we're at the beginning of the cache, need to reevaluate the // rule status if (fPositionInCache <= 0) { fLastStatusIndexValid = FALSE; } utext_setNativeIndex(fText, fCachedBreakPositions[fPositionInCache]); return fCachedBreakPositions[fPositionInCache]; } else { reset(); } } // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the // text's starting offset if (fText == NULL || offset > utext_nativeLength(fText)) { // return BreakIterator::DONE; return last(); } else if (offset < 0) { return first(); } // if we start by updating the current iteration position to the // position specified by the caller, we can just use previous() // to carry out this operation if (fSafeFwdTable != NULL) { // new rule syntax utext_setNativeIndex(fText, offset); int32_t newOffset = (int32_t)UTEXT_GETNATIVEINDEX(fText); if (newOffset != offset) { // Will come here if specified offset was not a code point boundary AND // the underlying implmentation is using UText, which snaps any non-code-point-boundary // indices to the containing code point. // For breakitereator::preceding only, these non-code-point indices need to be moved // up to refer to the following codepoint. UTEXT_NEXT32(fText); offset = (int32_t)UTEXT_GETNATIVEINDEX(fText); } // TODO: (synwee) would it be better to just check for being in the middle of a surrogate pair, // rather than adjusting the position unconditionally? // (Change would interact with safe rules.) // TODO: change RBBI behavior for off-boundary indices to match that of UText? // affects only preceding(), seems cleaner, but is slightly different. UTEXT_PREVIOUS32(fText); handleNext(fSafeFwdTable); int32_t result = (int32_t)UTEXT_GETNATIVEINDEX(fText); while (result >= offset) { result = previous(); } return result; } if (fSafeRevTable != NULL) { // backup plan if forward safe table is not available // TODO: check whether this path can be discarded // It's probably OK to say that rules must supply both safe tables // if they use safe tables at all. We have certainly never described // to anyone how to work with just one safe table. utext_setNativeIndex(fText, offset); UTEXT_NEXT32(fText); // handle previous will give result <= offset handlePrevious(fSafeRevTable); // next will give result 0 or 1 boundary away from offset, // most of the time // we have to int32_t oldresult = next(); while (oldresult < offset) { int32_t result = next(); if (result >= offset) { return oldresult; } oldresult = result; } int32_t result = previous(); if (result >= offset) { return previous(); } return result; } // old rule syntax utext_setNativeIndex(fText, offset); return previous(); }
/** * Sets the iterator to refer to the first boundary position following * the specified position. * @offset The position from which to begin searching for a break position. * @return The position of the first break after the current position. */ int32_t BreakIterator::following(int32_t offset) { // if we have cached break positions and offset is in the range // covered by them, use them // TODO: could use binary search // TODO: what if offset is outside range, but break is not? if (fCachedBreakPositions != NULL) { if (offset >= fCachedBreakPositions[0] && offset < fCachedBreakPositions[fNumCachedBreakPositions - 1]) { fPositionInCache = 0; // We are guaranteed not to leave the array due to range test above while (offset >= fCachedBreakPositions[fPositionInCache]) { ++fPositionInCache; } int32_t pos = fCachedBreakPositions[fPositionInCache]; utext_setNativeIndex(fText, pos); return pos; } else { reset(); } } // if the offset passed in is already past the end of the text, // just return DONE; if it's before the beginning, return the // text's starting offset fLastRuleStatusIndex = 0; fLastStatusIndexValid = TRUE; if (fText == NULL || offset >= utext_nativeLength(fText)) { last(); return next(); } else if (offset < 0) { return first(); } // otherwise, set our internal iteration position (temporarily) // to the position passed in. If this is the _beginning_ position, // then we can just use next() to get our return value int32_t result = 0; if (fSafeRevTable != NULL) { // new rule syntax utext_setNativeIndex(fText, offset); // move forward one codepoint to prepare for moving back to a // safe point. // this handles offset being between a supplementary character UTEXT_NEXT32(fText); // handlePrevious will move most of the time to < 1 boundary away handlePrevious(fSafeRevTable); int32_t result = next(); while (result <= offset) { result = next(); } return result; } if (fSafeFwdTable != NULL) { // backup plan if forward safe table is not available utext_setNativeIndex(fText, offset); UTEXT_PREVIOUS32(fText); // handle next will give result >= offset handleNext(fSafeFwdTable); // previous will give result 0 or 1 boundary away from offset, // most of the time // we have to int32_t oldresult = previous(); while (oldresult > offset) { int32_t result = previous(); if (result <= offset) { return oldresult; } oldresult = result; } int32_t result = next(); if (result <= offset) { return next(); } return result; } // otherwise, we have to sync up first. Use handlePrevious() to back // up to a known break position before the specified position (if // we can determine that the specified position is a break position, // we don't back up at all). This may or may not be the last break // position at or before our starting position. Advance forward // from here until we've passed the starting position. The position // we stop on will be the first break position after the specified one. // old rule syntax utext_setNativeIndex(fText, offset); if (offset==0 || offset==1 && utext_getNativeIndex(fText)==0) { return next(); } result = previous(); while (result != BreakIterator::DONE && result <= offset) { result = next(); } return result; }
static void TestAPI(void) { UErrorCode status = U_ZERO_ERROR; UBool gFailed = FALSE; (void)gFailed; /* Suppress set but not used warning. */ /* Open */ { UText utLoc = UTEXT_INITIALIZER; const char * cString = "\x61\x62\x63\x64"; UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UText *utb; UChar c; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); c = utext_next32(uta); TEST_ASSERT(c == 0x41); utb = utext_close(uta); TEST_ASSERT(utb == NULL); uta = utext_openUTF8(&utLoc, cString, -1, &status); TEST_SUCCESS(status); TEST_ASSERT(uta == &utLoc); uta = utext_close(&utLoc); TEST_ASSERT(uta == &utLoc); } /* utext_clone() */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; int64_t len; UText *uta; UText *utb; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); utb = utext_clone(NULL, uta, FALSE, FALSE, &status); TEST_SUCCESS(status); TEST_ASSERT(utb != NULL); TEST_ASSERT(utb != uta); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); utext_close(uta); utext_close(utb); } /* basic access functions */ { UChar uString[] = {0x41, 0x42, 0x43, 0}; UText *uta; UChar32 c; int64_t len; UBool b; int64_t i; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_ASSERT(uta!=NULL); TEST_SUCCESS(status); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==TRUE); len = utext_nativeLength(uta); TEST_ASSERT(len == u_strlen(uString)); b = utext_isLengthExpensive(uta); TEST_ASSERT(b==FALSE); c = utext_char32At(uta, 0); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = utext_previous32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[0]); c = utext_next32From(uta, 1); TEST_ASSERT(c==uString[1]); c = utext_next32From(uta, u_strlen(uString)); TEST_ASSERT(c==U_SENTINEL); c = utext_previous32From(uta, 2); TEST_ASSERT(c==uString[1]); i = utext_getNativeIndex(uta); TEST_ASSERT(i == 1); utext_setNativeIndex(uta, 0); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==1); b = utext_moveIndex32(uta, u_strlen(uString)-1); TEST_ASSERT(b==TRUE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); b = utext_moveIndex32(uta, 1); TEST_ASSERT(b==FALSE); i = utext_getNativeIndex(uta); TEST_ASSERT(i==u_strlen(uString)); utext_setNativeIndex(uta, 0); c = UTEXT_NEXT32(uta); TEST_ASSERT(c==uString[0]); c = utext_current32(uta); TEST_ASSERT(c==uString[1]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==uString[0]); c = UTEXT_PREVIOUS32(uta); TEST_ASSERT(c==U_SENTINEL); utext_close(uta); } { /* * UText opened on a NULL string with zero length */ UText *uta; UChar32 c; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); uta = utext_openUTF8(NULL, NULL, 0, &status); TEST_SUCCESS(status); c = UTEXT_NEXT32(uta); TEST_ASSERT(c == U_SENTINEL); utext_close(uta); } { /* * extract */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UChar buf[100]; int32_t i; /* Test pinning of input bounds */ UChar uString2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0}; UChar * uString2Ptr = uString2 + 5; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; i = utext_extract(uta, 0, 100, NULL, 0, &status); TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR); TEST_ASSERT(i == u_strlen(uString)); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, 0, 100, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString)); i = u_strcmp(uString, buf); TEST_ASSERT(i == 0); utext_close(uta); /* Test pinning of input bounds */ status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString2Ptr, -1, &status); TEST_SUCCESS(status); status = U_ZERO_ERROR; memset(buf, 0, sizeof(buf)); i = utext_extract(uta, -3, 20, buf, 100, &status); TEST_SUCCESS(status); TEST_ASSERT(i == u_strlen(uString2Ptr)); i = u_strcmp(uString2Ptr, buf); TEST_ASSERT(i == 0); utext_close(uta); } { /* * Copy, Replace, isWritable * Can't create an editable UText from plain C, so all we * can easily do is check that errors returned. */ UText *uta; UChar uString[] = {0x41, 0x42, 0x43, 0}; UBool b; status = U_ZERO_ERROR; uta = utext_openUChars(NULL, uString, -1, &status); TEST_SUCCESS(status); b = utext_isWritable(uta); TEST_ASSERT(b == FALSE); b = utext_hasMetaData(uta); TEST_ASSERT(b == FALSE); utext_replace(uta, 0, 1, /* start, limit */ uString, -1, /* replacement, replacement length */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_copy(uta, 0, 1, /* start, limit */ 2, /* destination index */ FALSE, /* move flag */ &status); TEST_ASSERT(status == U_NO_WRITE_PERMISSION); utext_close(uta); } }