Пример #1
0
/* Print each element in order: */
void printEachForward( UBreakIterator* boundary, UChar* str) {
  int32_t end;
  int32_t start = ubrk_first(boundary);
  for (end = ubrk_next(boundary); end != UBRK_DONE; start = end, end =
	 ubrk_next(boundary)) {
    printTextRange(str, start, end );
  }
}
Пример #2
0
static void TestBug11665(void) {
    // The problem was with the incorrect breaking of Japanese text beginning
    // with Katakana characters when no prior Japanese or Chinese text had been
    // encountered.
    //
    // Tested here in cintltst, rather than in intltest, because only cintltst
    // tests have the ability to reset ICU, which is needed to get the bug
    // to manifest itself.

    static UChar japaneseText[] = {0x30A2, 0x30EC, 0x30EB, 0x30AE, 0x30FC, 0x6027, 0x7D50, 0x819C, 0x708E};
    int32_t boundaries[10] = {0};
    UBreakIterator *bi = NULL;
    int32_t brk;
    int32_t brkIdx = 0;
    int32_t totalBreaks = 0;
    UErrorCode status = U_ZERO_ERROR;

    ctest_resetICU();
    bi = ubrk_open(UBRK_WORD, "en_US", japaneseText, UPRV_LENGTHOF(japaneseText), &status);
    TEST_ASSERT_SUCCESS(status);
    if (!bi) {
        return;
    }
    for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
        boundaries[brkIdx] = brk;
        if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
            break;
        }
    }
    if (brkIdx <= 2 || brkIdx >= UPRV_LENGTHOF(boundaries)) {
        log_err("%s:%d too few or many breaks found.\n", __FILE__, __LINE__);
    } else {
        totalBreaks = brkIdx;
        brkIdx = 0;
        for (brk=ubrk_first(bi); brk != UBRK_DONE; brk=ubrk_next(bi)) {
            if (brk != boundaries[brkIdx]) {
                log_err("%s:%d Break #%d differs between first and second iteration.\n", __FILE__, __LINE__, brkIdx);
                break;
            }
            if (++brkIdx >= UPRV_LENGTHOF(boundaries) - 1) {
                log_err("%s:%d Too many breaks.\n", __FILE__, __LINE__);
                break;
            }
        }
        if (totalBreaks != brkIdx) {
            log_err("%s:%d Number of breaks differ between first and second iteration.\n", __FILE__, __LINE__);
        }
    }
    ubrk_close(bi);
}
Пример #3
0
/* {{{ grapheme_extract_bytecount_iter - grapheme iterator for grapheme_extract MAXBYTES */
static inline int32_t
grapheme_extract_bytecount_iter(UBreakIterator *bi, int32_t bsize, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, prev_pos = 0;
	int ret_pos = 0, prev_ret_pos = 0;

	while ( 1 ) {
		pos = ubrk_next(bi);

		if ( UBRK_DONE == pos ) {
			break;
		}

		prev_ret_pos = ret_pos;
		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);

		if ( ret_pos > bsize ) {
			ret_pos = prev_ret_pos;
			break;
		}

		if ( prev_ret_pos == ret_pos ) {
			/* something wrong - malformed utf8? */
			break;
		}

		prev_pos = pos;
	}

	return ret_pos;
}
unsigned numGraphemeClusters(StringView string)
{
    unsigned stringLength = string.length();
    
    if (!stringLength)
        return 0;

    // The only Latin-1 Extended Grapheme Cluster is CRLF.
    if (string.is8Bit()) {
        auto* characters = string.characters8();
        unsigned numCRLF = 0;
        for (unsigned i = 1; i < stringLength; ++i)
            numCRLF += characters[i - 1] == '\r' && characters[i] == '\n';
        return stringLength - numCRLF;
    }

    NonSharedCharacterBreakIterator iterator { string };
    if (!iterator) {
        ASSERT_NOT_REACHED();
        return stringLength;
    }

    unsigned numGraphemeClusters = 0;
    while (ubrk_next(iterator) != UBRK_DONE)
        ++numGraphemeClusters;
    return numGraphemeClusters;
}
Пример #5
0
/* {{{ grapheme_split_string: find and optionally return grapheme boundaries */
int grapheme_split_string(const UChar *text, int32_t text_length, int boundary_array[], int boundary_array_len TSRMLS_DC )
{
	unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
	UErrorCode		status = U_ZERO_ERROR;
	int ret_len, pos;
	UBreakIterator* bi;

	bi = grapheme_get_break_iterator((void*)u_break_iterator_buffer, &status TSRMLS_CC );

	if( U_FAILURE(status) ) {
		return -1;
	}
	
	ubrk_setText(bi, text, text_length,	&status);

	pos = 0;
	
	for ( ret_len = 0; pos != UBRK_DONE; ) {
	
		pos = ubrk_next(bi);
		
		if ( pos != UBRK_DONE ) {
		
			if ( NULL != boundary_array && ret_len < boundary_array_len ) {
				boundary_array[ret_len] = pos;
			}

			ret_len++;
		}
	}
	 		
	ubrk_close(bi);
	
	return ret_len;
}
Пример #6
0
/* {{{ grapheme_extract_charcount_iter - grapheme iterator for grapheme_extract MAXCHARS */
static inline int32_t
grapheme_extract_charcount_iter(UBreakIterator *bi, int32_t csize, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, prev_pos = 0;
	int ret_pos = 0, prev_ret_pos = 0;

	while ( 1 ) {
		pos = ubrk_next(bi);

		if ( UBRK_DONE == pos ) {
			break;
		}

		/* if we are beyond our limit, then the loop is done */
		if ( pos > csize ) {
			break;
		}

		/* update our pointer in the original UTF-8 buffer by as many characters
		   as ubrk_next iterated over */

		prev_ret_pos = ret_pos;
		U8_FWD_N(pstr, ret_pos, str_len, pos - prev_pos);

		if ( prev_ret_pos == ret_pos ) {
			/* something wrong - malformed utf8? */
			break;
		}

		prev_pos = pos;
	}

	return ret_pos;
}
Пример #7
0
// BreakIterator.split {{{
static PyObject *
icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
    int32_t prev = 0, p = 0, sz = 0;
    PyObject *ans = NULL, *token = NULL;
  
    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz > 0) {
            token = icu_to_python(self->text + prev, sz);
            if (token == NULL) {
                Py_DECREF(ans); ans = NULL; break; 
            }
            if (PyList_Append(ans, token) != 0) {
                Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; 
            }
            Py_DECREF(token);
        }
    }

    return ans;

} // }}}
Пример #8
0
	std::vector<lstring> convert_split_words(const lstring &lt) {
		std::vector<lstring> ret;

		UBreakIterator* bi;
		int prev = -1, pos;

		UErrorCode err = U_ZERO_ERROR;
		bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err);
		if (U_FAILURE(err))
			return ret;

		pos = ubrk_first(bi);
		while (pos != UBRK_DONE) {
			int rules = ubrk_getRuleStatus(bi);
			if ((rules == UBRK_WORD_NONE) || (prev == -1)) {
				prev = pos;
			} else {
				ret.emplace_back(lt.substr(prev, pos - prev));

				prev = -1;
			}

			pos = ubrk_next(bi);
		}

		ubrk_close(bi);

		return ret;
	}
unsigned numCodeUnitsInGraphemeClusters(StringView string, unsigned numGraphemeClusters)
{
    unsigned stringLength = string.length();

    if (stringLength <= numGraphemeClusters)
        return stringLength;

    // The only Latin-1 Extended Grapheme Cluster is CRLF.
    if (string.is8Bit()) {
        auto* characters = string.characters8();
        unsigned i, j;
        for (i = 0, j = 0; i < numGraphemeClusters && j + 1 < stringLength; ++i, ++j)
            j += characters[j] == '\r' && characters[j + 1] == '\n';
        return j + (i < numGraphemeClusters);
    }

    NonSharedCharacterBreakIterator iterator { string };
    if (!iterator) {
        ASSERT_NOT_REACHED();
        return stringLength;
    }

    for (unsigned i = 0; i < numGraphemeClusters; ++i) {
        if (ubrk_next(iterator) == UBRK_DONE)
            return stringLength;
    }
    return ubrk_current(iterator);
}
static jint nextImpl(JNIEnv* env, jclass, jint address, jint n) {
    UBreakIterator* bi = breakIterator(address);
    if (n < 0) {
        while (n++ < -1) {
            ubrk_previous(bi);
        }
        return ubrk_previous(bi);
    } else if (n == 0) {
        return ubrk_current(bi);
    } else {
        while (n-- > 1) {
            ubrk_next(bi);
        }
        return ubrk_next(bi);
    }
    return -1;
}
Пример #11
0
// BreakIterator.index {{{
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL, *needle = NULL;
    int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0;

    buf = python_to_icu(token, &sz);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;
    needle = buf;
    if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; }
    if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1;

    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word

        if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) {
            if (word_start > 0 && (
                    (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) ||
                    (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1]))
            )) continue;
            if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue;

            if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; }

            if (
                    // Check that the found word is followed by a word boundary
                    ubrk_isBoundary(self->break_iterator, word_start + sz) &&
                    // If there is a leading hyphen check  that the leading
                    // hyphen is preceded by a word boundary
                    (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) &&
                    // Check that there is a word boundary *after* the trailing
                    // hyphen. We cannot rely on ubrk_isBoundary() as that
                    // always returns true because of the trailing hyphen.
                    (!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            ) { ans = word_start; break; }

            if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary()
        }
    }
    if (leading_hyphen && ans > -1) ans -= 1;
#ifdef Py_UNICODE_WIDE
    if (ans > 0) ans = u_countChar32(self->text, ans);
#endif
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("l", (long)ans);

} // }}}
Пример #12
0
/*
 *  static void TestBreakIteratorUText(void);
 *
 *         Test that ubrk_setUText() is present and works for a simple case.
 */
static void TestBreakIteratorUText(void) {
    const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67";  /* c3 85 is utf-8 for A with a ring on top */
                      /*   0  1   2 34567890  */

    UErrorCode      status = U_ZERO_ERROR;
    UBreakIterator *bi     = NULL;
    int32_t         pos    = 0;


    UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status);
    TEST_ASSERT_SUCCESS(status);

    bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
        return;
    }

    ubrk_setUText(bi, ut, &status);
    if (U_FAILURE(status)) {
        log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
        return;
    }

    pos = ubrk_first(bi);
    TEST_ASSERT(pos == 0);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 4);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 5);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 10);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == UBRK_DONE);
    ubrk_close(bi);
    utext_close(ut);
}
Пример #13
0
static void TestBreakIteratorTailoring(void) {
    const RBBITailoringTest * testPtr;
    for (testPtr = tailoringTests; testPtr->locale != NULL; ++testPtr) {
        UErrorCode status = U_ZERO_ERROR;
        UBreakIterator* ubrkiter = ubrk_open(testPtr->type, testPtr->locale, testPtr->test, -1, &status);
        if ( U_SUCCESS(status) ) {
            int32_t offset, offsindx;
            UBool foundError;

            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_next(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsFwd[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx], offset);
                    foundError = TRUE;
                }
            }
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_next expected %d, got UBRK_DONE\n",
                    	testPtr->locale, testPtr->type, testPtr->offsFwd[offsindx]);
            }

            foundError = FALSE;
            for (offsindx = 0; (offset = ubrk_previous(ubrkiter)) != UBRK_DONE; ++offsindx) {
                if (!foundError && offsindx >= testPtr->numOffsets) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected UBRK_DONE, got %d\n",
                            testPtr->locale, testPtr->type, offset);
                    foundError = TRUE;
                } else if (!foundError && offset != testPtr->offsRev[offsindx]) {
                    log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got %d\n",
                            testPtr->locale, testPtr->type, testPtr->offsRev[offsindx], offset);
                    foundError = TRUE;
                }
            }
            if (!foundError && offsindx < testPtr->numOffsets) {
                log_err("FAIL: locale %s, break type %d, ubrk_previous expected %d, got UBRK_DONE\n",
                    	testPtr->locale, testPtr->type, testPtr->offsRev[offsindx]);
            }

            ubrk_close(ubrkiter);
        } else {
            log_err_status(status, "FAIL: locale %s, break type %d, ubrk_open status: %s\n", testPtr->locale, testPtr->type, u_errorName(status));
        }
    }
}
Пример #14
0
/*
 *  TestBreakIteratorRules - Verify that a break iterator can be created from
 *                           a set of source rules.
 */
static void TestBreakIteratorRules() {
    /*  Rules will keep together any run of letters not including 'a', OR
     *             keep together 'abc', but only when followed by 'def', OTHERWISE
     *             just return one char at a time.
     */
    char         rules[]  = "abc{666}/def;\n   [\\p{L} - [a]]* {2};  . {1};";
    /*                        0123456789012345678 */
    char         data[]   =  "abcdex abcdefgh-def";     /* the test data string                     */
    char         breaks[] =  "**    **  *    **  *";    /*  * the expected break positions          */
    char         tags[]   =  "01    21  6    21  2";    /*  expected tag values at break positions  */
    int32_t      tagMap[] = {0, 1, 2, 3, 4, 5, 666};

    UChar       *uData;
    void        *freeHook = NULL;
    UErrorCode   status   = U_ZERO_ERROR;
    int32_t      pos;
    int          i;

    UBreakIterator *bi = testOpenRules(rules);
    if (bi == NULL) {return;}
    uData = toUChar(data, &freeHook);
    ubrk_setText(bi,  uData, -1, &status);

    pos = ubrk_first(bi);
    for (i=0; i<sizeof(breaks); i++) {
        if (pos == i && breaks[i] != '*') {
            log_err("FAIL: unexpected break at position %d found\n", pos);
            break;
        }
        if (pos != i && breaks[i] == '*') {
            log_err("FAIL: expected break at position %d not found.\n", i);
            break;
        }
        if (pos == i) {
            int32_t tag, expectedTag;
            tag = ubrk_getRuleStatus(bi);
            expectedTag = tagMap[tags[i]&0xf];
            if (tag != expectedTag) {
                log_err("FAIL: incorrect tag value.  Position = %d;  expected tag %d, got %d",
                    pos, expectedTag, tag);
                break;
            }
            pos = ubrk_next(bi);
        }
    }

    freeToUCharStrings(&freeHook);
    ubrk_close(bi);
}
Пример #15
0
MojErr MojDbTextTokenizer::tokenize(const MojString& text, MojDbTextCollator* collator, KeySet& keysOut) const
{
    LOG_TRACE("Entering function %s", __FUNCTION__);
    MojAssert(m_ubrk.get());

    // convert to UChar from str
    MojDbTextUtils::UnicodeVec unicodeStr;
    MojErr err = MojDbTextUtils::strToUnicode(text, unicodeStr);
    MojErrCheck(err);

    // clone break iterator and set text
    MojByte buf[U_BRK_SAFECLONE_BUFFERSIZE];
    UErrorCode status = U_ZERO_ERROR;
    MojInt32 size = sizeof(buf);
    IterPtr ubrk(ubrk_safeClone(m_ubrk.get(), buf, &size, &status));
    MojUnicodeErrCheck(status);
    MojAssert(ubrk.get());
    ubrk_setText(ubrk.get(), unicodeStr.begin(), (MojInt32) unicodeStr.size(), &status);
    MojUnicodeErrCheck(status);

    MojInt32 tokBegin = -1;
    MojInt32 pos = ubrk_first(ubrk.get());
    while (pos != UBRK_DONE) {
        UWordBreak status = (UWordBreak) ubrk_getRuleStatus(ubrk.get());
        if (status != UBRK_WORD_NONE) {
            MojAssert(tokBegin != -1);
            MojDbKey key;
            const UChar* tokChars = unicodeStr.begin() + tokBegin;
            MojSize tokSize = (MojSize) (pos - tokBegin);
            if (collator) {
                err = collator->sortKey(tokChars, tokSize, key);
                MojErrCheck(err);
            } else {
                MojString tok;
                err = MojDbTextUtils::unicodeToStr(tokChars, tokSize, tok);
                MojErrCheck(err);
                err = key.assign(tok);
                MojErrCheck(err);
            }
            err = keysOut.put(key);
            MojErrCheck(err);
        }
        tokBegin = pos;
        pos = ubrk_next(ubrk.get());
    }
    return MojErrNone;
}
Пример #16
0
int tokenizer_next( tokenizer_t *t, char *word, size_t size ) {
  UChar   savedEndChar;
  int k;

  // start iterator
  if( t->end == 0 ) {
    t->start = ubrk_first(t->boundary);
  }

  // Find next word
again:
  t->end = ubrk_next(t->boundary);
  if( t->end == UBRK_DONE ) {
    return -1;
  }

	// Null terminate
  savedEndChar = t->str[t->end];
  t->str[t->end] = 0;

	// Skip unct
	if( t->end - t->start == 1 && u_ispunct( t->str[t->start] ) ) {
    t->str[t->end] = savedEndChar;
    t->start = t->end;
    goto again;
	}

	// Skip whitespace
	for( k=t->start; k<t->end; k++ ) {
	  if( u_isspace( t->str[k] ) == 1 ) {
      t->str[t->end] = savedEndChar;
      t->start = t->end;
			goto again;
		}
  }

	// Copy to C bffer
  u_austrncpy(word, t->str+t->start, size-1);
  word[size-1] = 0;

  printf("string[%2d..%2d] \"%s\" %d\n", t->start, t->end-1, word, u_isspace( t->str[t->start])); 
  t->str[t->end] = savedEndChar;
  t->start = t->end;
  return 0;
}
Пример #17
0
/*
*   TestsBreakIteratorStatusVals()   Test the ubrk_getRuleStatusVec() funciton
*/
static void TestBreakIteratorStatusVec() {
    #define RULE_STRING_LENGTH 200
    UChar          rules[RULE_STRING_LENGTH];

    #define TEST_STRING_LENGTH 25
    UChar           testString[TEST_STRING_LENGTH];
    UBreakIterator *bi        = NULL;
    int32_t         pos       = 0;
    int32_t         vals[10];
    int32_t         numVals;
    UErrorCode      status    = U_ZERO_ERROR;

    u_uastrncpy(rules,  "[A-N]{100}; \n"
                             "[a-w]{200}; \n"
                             "[\\p{L}]{300}; \n"
                             "[\\p{N}]{400}; \n"
                             "[0-5]{500}; \n"
                              "!.*;\n", RULE_STRING_LENGTH);
    u_uastrncpy(testString, "ABC", TEST_STRING_LENGTH);


    bi = ubrk_openRules(rules, -1, testString, -1, NULL, &status);
    TEST_ASSERT_SUCCESS(status);
    TEST_ASSERT(bi != NULL);

    /* The TEST_ASSERT above should change too... */
    if (bi != NULL) {
        pos = ubrk_next(bi);
        TEST_ASSERT(pos == 1);

        memset(vals, -1, sizeof(vals));
        numVals = ubrk_getRuleStatusVec(bi, vals, 10, &status);
        TEST_ASSERT_SUCCESS(status);
        TEST_ASSERT(numVals == 2);
        TEST_ASSERT(vals[0] == 100);
        TEST_ASSERT(vals[1] == 300);
        TEST_ASSERT(vals[2] == -1);

        numVals = ubrk_getRuleStatusVec(bi, vals, 0, &status);
        TEST_ASSERT(status == U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(numVals == 2);
    }

    ubrk_close(bi);
}
Пример #18
0
static void TestBreakIteratorRefresh(void) {
    /*
     *  RefreshInput changes out the input of a Break Iterator without
     *    changing anything else in the iterator's state.  Used with Java JNI,
     *    when Java moves the underlying string storage.   This test
     *    runs a ubrk_next() repeatedly, moving the text in the middle of the sequence.
     *    The right set of boundaries should still be found.
     */
    UChar testStr[]  = {0x20, 0x41, 0x20, 0x42, 0x20, 0x43, 0x20, 0x44, 0x0};  /* = " A B C D"  */
    UChar movedStr[] = {0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20,  0};
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *bi;
    UText ut1 = UTEXT_INITIALIZER;
    UText ut2 = UTEXT_INITIALIZER;
    
    bi = ubrk_open(UBRK_LINE, "en_US", NULL, 0, &status);
    TEST_ASSERT_SUCCESS(status);
    if (U_FAILURE(status)) {
        return;
    }

    utext_openUChars(&ut1, testStr, -1, &status);
    TEST_ASSERT_SUCCESS(status);
    ubrk_setUText(bi, &ut1, &status);
    TEST_ASSERT_SUCCESS(status);

    if (U_SUCCESS(status)) {
        /* Line boundaries will occur before each letter in the original string */
        TEST_ASSERT(1 == ubrk_next(bi));
        TEST_ASSERT(3 == ubrk_next(bi));

        /* Move the string, kill the original string.  */
        u_strcpy(movedStr, testStr);
        u_memset(testStr, 0x20, u_strlen(testStr));
        utext_openUChars(&ut2, movedStr, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        ubrk_refreshUText(bi, &ut2, &status);
        TEST_ASSERT_SUCCESS(status);
    
        /* Find the following matches, now working in the moved string. */
        TEST_ASSERT(5 == ubrk_next(bi));
        TEST_ASSERT(7 == ubrk_next(bi));
        TEST_ASSERT(8 == ubrk_next(bi));
        TEST_ASSERT(UBRK_DONE == ubrk_next(bi));
        TEST_ASSERT_SUCCESS(status);

        utext_close(&ut1);
        utext_close(&ut2);
    }
    ubrk_close(bi);
}
Пример #19
0
ERL_NIF_TERM len(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary in;
    cloner* ptr; 
    UBreakIterator* iter; 
    int count = 0, pos;
    UErrorCode status = U_ZERO_ERROR;

    if (argc != 2)
        return enif_make_badarg(env);

    /* Last argument must be a binary */
    if (!(enif_inspect_binary(env, argv[1], &in)
       && enif_get_resource(env, argv[0], iterator_type, (void**) &ptr))) {
        return enif_make_badarg(env);
    }    
    iter = (UBreakIterator*) cloner_get(ptr);
    CHECK_RES(env, iter);

    if (iter == NULL) {
        return enif_make_badarg(env);
    }

    /* Do count */
    ubrk_setText(iter,
        (UChar *) in.data,
        TO_ULEN(in.size),
        &status);
    CHECK(env, status);

    pos = ubrk_first(iter);
    if (pos != UBRK_DONE)
        while (1) {
            pos = ubrk_next(iter);
            if (pos == UBRK_DONE)
                break;

            if (is_valid_elem(ptr, iter))
                count++;
        }

    return enif_make_int(env, count);
}
Пример #20
0
/* {{{ grapheme_count_graphemes */
int32_t grapheme_count_graphemes(UBreakIterator *bi, UChar *string, int32_t string_len)
{
	int ret_len = 0;
	int pos = 0;
	UErrorCode		status = U_ZERO_ERROR;
	
	ubrk_setText(bi, string, string_len, &status);

	do {
	
		pos = ubrk_next(bi);
		
		if ( UBRK_DONE != pos ) {
			ret_len++;
		}
		
	} while ( UBRK_DONE != pos );
	
	return ret_len;
}
Пример #21
0
Файл: utr.c Проект: julp/ugrep
int32_t grapheme_count(UBreakIterator *ubrk, const UString *ustr)
{
    int32_t i, count;
    UErrorCode status;

    count = 0;
    status = U_ZERO_ERROR;
    ubrk_setText(ubrk, ustr->ptr, ustr->len, &status);
    if (U_FAILURE(status)) {
        return -1;
    }
    if (UBRK_DONE != (i = ubrk_first(ubrk))) {
        while (UBRK_DONE != (i = ubrk_next(ubrk))) {
            ++count;
        }
    }
    ubrk_unbindText(ubrk);

    return count;
}
Пример #22
0
// BreakIterator.index {{{
static PyObject *
icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000 
#error Not implemented for python >= 3.3
#endif

    UChar *buf = NULL;
    int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1;
    PyObject *token = NULL;
  
    if (!PyArg_ParseTuple(args, "O", &token)) return NULL;
    buf = python_to_icu(token, &sz, 1);
    if (buf == NULL) return NULL;
    if (sz < 1) goto end;

    Py_BEGIN_ALLOW_THREADS;
    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { 
#ifdef PY_UNICODE_WIDE
            ans = u_countChar32(self->text, prev);
#else
            ans = prev; 
#endif
            break;
        }
    }
    Py_END_ALLOW_THREADS;

end:
    free(buf);
    return Py_BuildValue("i", ans);

} // }}}
Пример #23
0
// BreakIterator.split2 {{{
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) {
#if PY_VERSION_HEX >= 0x03030000 
#error Not implemented for python >= 3.3
#endif

    int32_t prev = 0, p = 0, sz = 0;
    PyObject *ans = NULL, *temp = NULL;
  
    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        prev = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) 
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev;
        if (sz > 0) {
#ifdef Py_UNICODE_WIDE
            sz = u_countChar32(self->text + prev, sz);
            prev = u_countChar32(self->text, prev);
#endif
            temp = Py_BuildValue("II", prev, sz); 
            if (temp == NULL) {
                Py_DECREF(ans); ans = NULL; break; 
            } 
            if (PyList_Append(ans, temp) != 0) {
                Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; 
            }
            Py_DECREF(temp);
        }
    }

    return ans;

} // }}}
Пример #24
0
/* {{{ grapheme_extract_count_iter - grapheme iterator for grapheme_extract COUNT */
static inline int32_t
grapheme_extract_count_iter(UBreakIterator *bi, int32_t size, unsigned char *pstr, int32_t str_len)
{
	int pos = 0, next_pos = 0;
	int ret_pos = 0;

	while ( size ) {
		next_pos = ubrk_next(bi);

		if ( UBRK_DONE == next_pos ) {
			break;
		}
		pos = next_pos;
		size--;
	}

	/* pos is one past the last UChar - and represent the number of code units to
		advance in the utf-8 buffer
	*/

	U8_FWD_N(pstr, ret_pos, str_len, pos);

	return ret_pos;
}
Пример #25
0
/*
** Extract the next token from a tokenization cursor.
*/
static int icuNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  IcuCursor *pCsr = (IcuCursor *)pCursor;

  int iStart = 0;
  int iEnd = 0;
  int nByte = 0;

  while( iStart==iEnd ){
    UChar32 c;

    iStart = ubrk_current(pCsr->pIter);
    iEnd = ubrk_next(pCsr->pIter);
    if( iEnd==UBRK_DONE ){
      return SQLITE_DONE;
    }

    while( iStart<iEnd ){
      int iWhite = iStart;
      U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
      if( u_isspace(c) ){
        iStart = iWhite;
      }else{
        break;
      }
    }
    assert(iStart<=iEnd);
  }

  do {
    UErrorCode status = U_ZERO_ERROR;
    if( nByte ){
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
      if( !zNew ){
        return SQLITE_NOMEM;
      }
      pCsr->zBuffer = zNew;
      pCsr->nBuffer = nByte;
    }

    u_strToUTF8(
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
        &status                                  /* Output success/failure */
    );
  } while( nByte>pCsr->nBuffer );

  *ppToken = pCsr->zBuffer;
  *pnBytes = nByte;
  *piStartOffset = pCsr->aOffset[iStart];
  *piEndOffset = pCsr->aOffset[iEnd];
  *piPosition = pCsr->iToken++;

  return SQLITE_OK;
}
Пример #26
0
/*
 * Internal titlecasing function.
 *
 * Must get titleIter!=NULL.
 */
static int32_t
_toTitle(const UCaseProps *csp,
         UChar *dest, int32_t destCapacity,
         const UChar *src, UCaseContext *csc,
         int32_t srcLength,
         UBreakIterator *titleIter,
         const char *locale, int32_t *locCache,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, index, destIndex, length;
    UBool isFirstIndex;

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            index=ubrk_first(titleIter);
        } else {
            index=ubrk_next(titleIter);
        }
        if(index==UBRK_DONE || index>srcLength) {
            index=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<index) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            for(;;) {
                U16_NEXT(src, titleLimit, srcLength, c);
                if(UCASE_NONE!=ucase_getType(csp, c)) {
                    break; /* cased letter at [titleStart..titleLimit[ */
                }
                titleStart=titleLimit;
                if(titleLimit==index) {
                    /*
                     * only uncased characters in [prev..index[
                     * stop with titleStart==titleLimit==index
                     */
                    break;
                }
            }
            length=titleStart-prev;
            if(length>0) {
                if((destIndex+length)<=destCapacity) {
                    uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                }
                destIndex+=length;
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csp, c, utf16_caseContextIterator, csc, &s, locale, locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                /* lowercase [titleLimit..index[ */
                if(titleLimit<index) {
                    destIndex+=
                        _caseMap(
                            csp, ucase_toFullLower,
                            dest+destIndex, destCapacity-destIndex,
                            src, csc,
                            titleLimit, index,
                            locale, locCache,
                            pErrorCode);
                }
            }
        }

        prev=index;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
Пример #27
0
/*
 * Internal titlecasing function.
 */
static int32_t
_toTitle(UCaseMap *csm,
         UChar *dest, int32_t destCapacity,
         const UChar *src, UCaseContext *csc,
         int32_t srcLength,
         UErrorCode *pErrorCode) {
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    if(csm->iter!=NULL) {
        ubrk_setText(csm->iter, src, srcLength, pErrorCode);
    } else {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            src, srcLength,
                            pErrorCode);
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=ubrk_first(csm->iter);
        } else {
            idx=ubrk_next(csm->iter);
        }
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            U16_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U16_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                length=titleStart-prev;
                if(length>0) {
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length*U_SIZEOF_UCHAR);
                    }
                    destIndex+=length;
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf16_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s); 

                /* Special case Dutch IJ titlecasing */
                if ( titleStart+1 < idx && 
                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == (UChar32) 0x0049 || src[titleStart] == (UChar32) 0x0069 ) &&
                     ( src[titleStart+1] == (UChar32) 0x004A || src[titleStart+1] == (UChar32) 0x006A )) { 
                            c=(UChar32) 0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }

                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length*U_SIZEOF_UCHAR);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    return destIndex;
}
Пример #28
0
// BreakIterator.split2 {{{
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;

    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word
        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
            if (word_start > 0) { // Look for a leading hyphen
                sep = *(self->text + word_start - 1);
                if (IS_HYPHEN_CHAR(sep)) {
                    leading_hyphen = 1;
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            last_pos = p;
#ifdef Py_UNICODE_WIDE
            sz = u_countChar32(self->text + word_start, sz);
            word_start = u_countChar32(self->text, word_start);
#endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                if (temp == NULL) {
                    Py_DECREF(ans); ans = NULL; break;
                }
                if (PyList_Append(ans, temp) != 0) {
                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
                }
                Py_DECREF(temp);
            }
        }
    }

    return ans;

} // }}}
int textBreakNext(TextBreakIterator* iterator)
{
    return ubrk_next(reinterpret_cast<UBreakIterator*>(iterator));
}
Пример #30
0
/* Print first element */
void printFirst(UBreakIterator* boundary, UChar* str) {
  int32_t end;
  int32_t start = ubrk_first(boundary);
  end = ubrk_next(boundary);
  printTextRange( str, start, end );
}