std::vector<lstring> convert_split_words(const lstring <) { std::vector<lstring> ret; UBreakIterator* bi; int prev = -1, pos; UErrorCode err = U_ZERO_ERROR; bi = ubrk_open(UBRK_WORD, get_locale(), (UChar *)lt.data(), lt.size(), &err); if (U_FAILURE(err)) return ret; pos = ubrk_first(bi); while (pos != UBRK_DONE) { int rules = ubrk_getRuleStatus(bi); if ((rules == UBRK_WORD_NONE) || (prev == -1)) { prev = pos; } else { ret.emplace_back(lt.substr(prev, pos - prev)); prev = -1; } pos = ubrk_next(bi); } ubrk_close(bi); return ret; }
// BreakIterator.split {{{ static PyObject * icu_BreakIterator_split(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { int32_t prev = 0, p = 0, sz = 0; PyObject *ans = NULL, *token = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { token = icu_to_python(self->text + prev, sz); if (token == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, token) != 0) { Py_DECREF(token); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(token); } } return ans; } // }}}
// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *token) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL, *needle = NULL; int32_t word_start = 0, p = 0, sz = 0, ans = -1, leading_hyphen = 0, trailing_hyphen = 0; buf = python_to_icu(token, &sz); if (buf == NULL) return NULL; if (sz < 1) goto end; needle = buf; if (sz > 1 && IS_HYPHEN_CHAR(buf[0])) { needle = buf + 1; leading_hyphen = 1; sz -= 1; } if (sz > 1 && IS_HYPHEN_CHAR(buf[sz-1])) trailing_hyphen = 1; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { word_start = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word if (self->text_len >= word_start + sz && memcmp(self->text + word_start, needle, sz * sizeof(UChar)) == 0) { if (word_start > 0 && ( (leading_hyphen && !IS_HYPHEN_CHAR(self->text[word_start-1])) || (!leading_hyphen && IS_HYPHEN_CHAR(self->text[word_start-1])) )) continue; if (!trailing_hyphen && IS_HYPHEN_CHAR(self->text[word_start + sz])) continue; if (p == UBRK_DONE || self->text_len <= word_start + sz) { ans = word_start; break; } if ( // Check that the found word is followed by a word boundary ubrk_isBoundary(self->break_iterator, word_start + sz) && // If there is a leading hyphen check that the leading // hyphen is preceded by a word boundary (!leading_hyphen || (word_start > 1 && ubrk_isBoundary(self->break_iterator, word_start - 2))) && // Check that there is a word boundary *after* the trailing // hyphen. We cannot rely on ubrk_isBoundary() as that // always returns true because of the trailing hyphen. 
(!trailing_hyphen || ubrk_following(self->break_iterator, word_start + sz) == UBRK_DONE || ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) ) { ans = word_start; break; } if (p != UBRK_DONE) ubrk_isBoundary(self->break_iterator, p); // Reset the iterator to its position before the call to ubrk_isBoundary() } } if (leading_hyphen && ans > -1) ans -= 1; #ifdef Py_UNICODE_WIDE if (ans > 0) ans = u_countChar32(self->text, ans); #endif Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("l", (long)ans); } // }}}
/**
 * Tell whether the iterator's current element should be kept.
 *
 * `res` is treated as a cloner_break. When it carries no skip_elem callback
 * every element is valid; otherwise the callback receives the iterator's
 * current rule status and the element is valid only when the callback
 * returns false. Used for word_only.
 *
 * @return 1 if the element is valid, 0 if it must be skipped.
 */
inline int is_valid_elem(cloner* res, UBreakIterator* iter)
{
    cloner_break* brk = (cloner_break*) res;

    if (brk->skip_elem != NULL) {
        int32_t rule_status = ubrk_getRuleStatus(iter);
        return (brk->skip_elem)(rule_status) ? 0 : 1;
    }

    /* No filter installed: nothing is ever skipped. */
    return 1;
}
/* * TestBreakIteratorRules - Verify that a break iterator can be created from * a set of source rules. */ static void TestBreakIteratorRules() { /* Rules will keep together any run of letters not including 'a', OR * keep together 'abc', but only when followed by 'def', OTHERWISE * just return one char at a time. */ char rules[] = "abc{666}/def;\n [\\p{L} - [a]]* {2}; . {1};"; /* 0123456789012345678 */ char data[] = "abcdex abcdefgh-def"; /* the test data string */ char breaks[] = "** ** * ** *"; /* * the expected break positions */ char tags[] = "01 21 6 21 2"; /* expected tag values at break positions */ int32_t tagMap[] = {0, 1, 2, 3, 4, 5, 666}; UChar *uData; void *freeHook = NULL; UErrorCode status = U_ZERO_ERROR; int32_t pos; int i; UBreakIterator *bi = testOpenRules(rules); if (bi == NULL) {return;} uData = toUChar(data, &freeHook); ubrk_setText(bi, uData, -1, &status); pos = ubrk_first(bi); for (i=0; i<sizeof(breaks); i++) { if (pos == i && breaks[i] != '*') { log_err("FAIL: unexpected break at position %d found\n", pos); break; } if (pos != i && breaks[i] == '*') { log_err("FAIL: expected break at position %d not found.\n", i); break; } if (pos == i) { int32_t tag, expectedTag; tag = ubrk_getRuleStatus(bi); expectedTag = tagMap[tags[i]&0xf]; if (tag != expectedTag) { log_err("FAIL: incorrect tag value. Position = %d; expected tag %d, got %d", pos, expectedTag, tag); break; } pos = ubrk_next(bi); } } freeToUCharStrings(&freeHook); ubrk_close(bi); }
/**
 * Break `text` into tokens and put one key per token into `keysOut`.
 *
 * The text is converted to UChar, a clone of m_ubrk is bound to it, and every
 * segment whose closing boundary has a rule status other than UBRK_WORD_NONE
 * becomes a key: the collator's sort key when `collator` is non-null,
 * otherwise the token converted back to a MojString.
 *
 * @return MojErrNone on success; the error checks return early on failure.
 */
MojErr MojDbTextTokenizer::tokenize(const MojString& text, MojDbTextCollator* collator, KeySet& keysOut) const
{
    LOG_TRACE("Entering function %s", __FUNCTION__);
    MojAssert(m_ubrk.get());

    // UChar copy of the input; declared before the iterator that is bound to it
    MojDbTextUtils::UnicodeVec unicodeStr;
    MojErr err = MojDbTextUtils::strToUnicode(text, unicodeStr);
    MojErrCheck(err);

    // work on a private clone of the shared break iterator
    MojByte buf[U_BRK_SAFECLONE_BUFFERSIZE];
    MojInt32 size = sizeof(buf);
    UErrorCode status = U_ZERO_ERROR;
    IterPtr ubrk(ubrk_safeClone(m_ubrk.get(), buf, &size, &status));
    MojUnicodeErrCheck(status);
    MojAssert(ubrk.get());
    ubrk_setText(ubrk.get(), unicodeStr.begin(), (MojInt32) unicodeStr.size(), &status);
    MojUnicodeErrCheck(status);

    // tokBegin trails pos by one boundary, so [tokBegin, pos) is the segment
    // that the current boundary closes
    MojInt32 tokBegin = -1;
    for (MojInt32 pos = ubrk_first(ubrk.get());
         pos != UBRK_DONE;
         tokBegin = pos, pos = ubrk_next(ubrk.get())) {
        const UWordBreak breakKind = (UWordBreak) ubrk_getRuleStatus(ubrk.get());
        if (breakKind == UBRK_WORD_NONE)
            continue;

        MojAssert(tokBegin != -1);
        MojDbKey key;
        const UChar* tokChars = unicodeStr.begin() + tokBegin;
        MojSize tokSize = (MojSize) (pos - tokBegin);
        if (collator) {
            err = collator->sortKey(tokChars, tokSize, key);
            MojErrCheck(err);
        } else {
            MojString tok;
            err = MojDbTextUtils::unicodeToStr(tokChars, tokSize, tok);
            MojErrCheck(err);
            err = key.assign(tok);
            MojErrCheck(err);
        }
        err = keysOut.put(key);
        MojErrCheck(err);
    }
    return MojErrNone;
}
// BreakIterator.index {{{ static PyObject * icu_BreakIterator_index(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif UChar *buf = NULL; int32_t prev = 0, p = 0, sz = 0, tsz = 0, ans = -1; PyObject *token = NULL; if (!PyArg_ParseTuple(args, "O", &token)) return NULL; buf = python_to_icu(token, &sz, 1); if (buf == NULL) return NULL; if (sz < 1) goto end; Py_BEGIN_ALLOW_THREADS; p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word tsz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz == tsz && memcmp(self->text + prev, buf, sz * sizeof(UChar)) == 0) { #ifdef PY_UNICODE_WIDE ans = u_countChar32(self->text, prev); #else ans = prev; #endif break; } } Py_END_ALLOW_THREADS; end: free(buf); return Py_BuildValue("i", ans); } // }}}
// BreakIterator.split2 {{{ static PyObject * icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args, PyObject *kwargs) { #if PY_VERSION_HEX >= 0x03030000 #error Not implemented for python >= 3.3 #endif int32_t prev = 0, p = 0, sz = 0; PyObject *ans = NULL, *temp = NULL; ans = PyList_New(0); if (ans == NULL) return PyErr_NoMemory(); p = ubrk_first(self->break_iterator); while (p != UBRK_DONE) { prev = p; p = ubrk_next(self->break_iterator); if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE) continue; // We are not at the start of a word sz = (p == UBRK_DONE) ? self->text_len - prev : p - prev; if (sz > 0) { #ifdef Py_UNICODE_WIDE sz = u_countChar32(self->text + prev, sz); prev = u_countChar32(self->text, prev); #endif temp = Py_BuildValue("II", prev, sz); if (temp == NULL) { Py_DECREF(ans); ans = NULL; break; } if (PyList_Append(ans, temp) != 0) { Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break; } Py_DECREF(temp); } } return ans; } // }}}
// BreakIterator.split2 {{{
/*
 * Return a Python list of (position, size) tuples describing the segments
 * found by the break iterator, with hyphenated words recombined.
 *
 * For UBRK_WORD iterators, segments closed by a UBRK_WORD_NONE boundary are
 * skipped. A hyphen directly before or after a segment is folded into that
 * segment's tuple. When a segment starts exactly one unit after the end of
 * the previously reported segment and that unit is a hyphen, no new tuple is
 * appended: the size stored in the last tuple is replaced by the combined
 * size instead.
 *
 * With Py_UNICODE_WIDE defined, positions and sizes are converted with
 * u_countChar32() before being stored.
 *
 * Returns the new list, or NULL if an allocation or the append failed.
 */
static PyObject *
icu_BreakIterator_split2(icu_BreakIterator *self, PyObject *args) {
#if PY_VERSION_HEX >= 0x03030000
#error Not implemented for python >= 3.3
#endif

    // last_pos: boundary that closed the previously reported segment
    // last_sz:  size stored in the most recently written tuple
    int32_t word_start = 0, p = 0, sz = 0, last_pos = 0, last_sz = 0;
    int is_hyphen_sep = 0, leading_hyphen = 0, trailing_hyphen = 0;
    UChar sep = 0;
    PyObject *ans = NULL, *temp = NULL, *t = NULL;

    ans = PyList_New(0);
    if (ans == NULL) return PyErr_NoMemory();

    p = ubrk_first(self->break_iterator);
    while (p != UBRK_DONE) {
        word_start = p; p = ubrk_next(self->break_iterator);
        if (self->type == UBRK_WORD && ubrk_getRuleStatus(self->break_iterator) == UBRK_WORD_NONE)
            continue;  // We are not at the start of a word

        // Past the last boundary the segment runs to the end of the text
        sz = (p == UBRK_DONE) ? self->text_len - word_start : p - word_start;
        if (sz > 0) {
            // ICU breaks on words containing hyphens, we do not want that, so we recombine manually
            is_hyphen_sep = 0; leading_hyphen = 0; trailing_hyphen = 0;
            if (word_start > 0) { // Look for a leading hyphen
                sep = *(self->text + word_start - 1);
                if (IS_HYPHEN_CHAR(sep)) {
                    leading_hyphen = 1;
                    // The hyphen joins this segment to the previous one only
                    // when the previous segment ended right before it
                    if (last_pos > 0 && word_start - last_pos == 1) is_hyphen_sep = 1;
                }
            }
            if (word_start + sz < self->text_len) { // Look for a trailing hyphen
                sep = *(self->text + word_start + sz);
                if (IS_HYPHEN_CHAR(sep)) trailing_hyphen = 1;
            }
            // Recorded before the conversion below, so it stays in the same
            // units as the boundaries returned by the iterator
            last_pos = p;
#ifdef Py_UNICODE_WIDE
            // From here on sz and word_start hold u_countChar32() results
            sz = u_countChar32(self->text + word_start, sz);
            word_start = u_countChar32(self->text, word_start);
#endif
            if (is_hyphen_sep && PyList_GET_SIZE(ans) > 0) {
                // Extend the previous tuple; last_sz already includes the
                // hyphen that separates the two segments
                sz = last_sz + sz + trailing_hyphen;
                last_sz = sz;
                t = PyInt_FromLong((long)sz);
                if (t == NULL) { Py_DECREF(ans); ans = NULL; break; }
                temp = PyList_GET_ITEM(ans, PyList_GET_SIZE(ans) - 1);
                // Release the old size, then store the new one in its slot
                Py_DECREF(PyTuple_GET_ITEM(temp, 1));
                PyTuple_SET_ITEM(temp, 1, t);
            } else {
                sz += leading_hyphen + trailing_hyphen;
                last_sz = sz;
                // The reported position points at the leading hyphen, if any
                temp = Py_BuildValue("ll", (long)(word_start - leading_hyphen), (long)sz);
                if (temp == NULL) {
                    Py_DECREF(ans); ans = NULL; break;
                }
                if (PyList_Append(ans, temp) != 0) {
                    Py_DECREF(temp); Py_DECREF(ans); ans = NULL; break;
                }
                Py_DECREF(temp);
            }
        }
    }

    return ans;
} // }}}
// Returns true when the iterator's current rule status is anything other
// than UBRK_WORD_NONE. The TextBreakIterator is reinterpreted as the
// underlying ICU UBreakIterator.
bool isWordTextBreak(TextBreakIterator* iterator)
{
    UBreakIterator* icuIterator = reinterpret_cast<UBreakIterator*>(iterator);
    return ubrk_getRuleStatus(icuIterator) != UBRK_WORD_NONE;
}
// Returns true when the iterator's current rule status is anything other
// than UBRK_WORD_NONE.
bool isWordTextBreak(UBreakIterator* iterator)
{
    const bool statusIsNone = (ubrk_getRuleStatus(iterator) == UBRK_WORD_NONE);
    return !statusIsNone;
}
static engine_return_t engine_fixed_match(error_t **error, void *data, const UString *subject) { int32_t ret; UErrorCode status; FETCH_DATA(data, p, fixed_pattern_t); status = U_ZERO_ERROR; if (ustring_empty(p->pattern)) { if (IS_WORD_BOUNDED(p->flags)) { if (ustring_empty(subject)) { return ENGINE_MATCH_FOUND; } else { int32_t l, u, lastState, state; ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } if (UBRK_DONE != (l = ubrk_first(p->ubrk))) { lastState = ubrk_getRuleStatus(p->ubrk); while (UBRK_DONE != (u = ubrk_next(p->ubrk))) { state = ubrk_getRuleStatus(p->ubrk); if (UBRK_WORD_NONE == lastState && lastState == state) { return ENGINE_MATCH_FOUND; } lastState = state; l = u; } } return ENGINE_NO_MATCH; } } else { return ENGINE_MATCH_FOUND; } } else if (NULL != p->usearch) { if (subject->len > 0) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } ret = usearch_first(p->usearch, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_first"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (ret != USEARCH_DONE ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } else { return ENGINE_NO_MATCH; } } else { UChar *m; int32_t pos; pos = 0; ret = ENGINE_NO_MATCH; if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } } while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) { pos = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) { ret = ENGINE_MATCH_FOUND; } pos += p->pattern->len; } ubrk_unbindText(p->ubrk); return ret; } }
static engine_return_t engine_fixed_match_all(error_t **error, void *data, const UString *subject, interval_list_t *intervals) { int32_t matches; UErrorCode status; FETCH_DATA(data, p, fixed_pattern_t); matches = 0; status = U_ZERO_ERROR; if (ustring_empty(p->pattern)) { if (IS_WORD_BOUNDED(p->flags)) { if (ustring_empty(subject)) { return ENGINE_MATCH_FOUND; } else { int32_t l, u, lastState, state; ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } if (UBRK_DONE != (l = ubrk_first(p->ubrk))) { lastState = ubrk_getRuleStatus(p->ubrk); while (UBRK_DONE != (u = ubrk_next(p->ubrk))) { state = ubrk_getRuleStatus(p->ubrk); if (UBRK_WORD_NONE == lastState && lastState == state) { return ENGINE_MATCH_FOUND; } lastState = state; l = u; } } return ENGINE_NO_MATCH; } } else { return ENGINE_MATCH_FOUND; } } else if (NULL != p->usearch) { int32_t l, u; if (subject->len > 0) { usearch_setText(p->usearch, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_setText"); return ENGINE_FAILURE; } for (l = usearch_first(p->usearch, &status); U_SUCCESS(status) && USEARCH_DONE != l; l = usearch_next(p->usearch, &status)) { matches++; u = l + usearch_getMatchedLength(p->usearch); if (interval_list_add(intervals, subject->len, l, u)) { return ENGINE_WHOLE_LINE_MATCH; } } if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "usearch_[first|next]"); return ENGINE_FAILURE; } usearch_unbindText(p->usearch); return (matches ? 
ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } else { return ENGINE_NO_MATCH; } } else { UChar *m; int32_t pos; pos = 0; if (NULL != p->ubrk) { ubrk_setText(p->ubrk, subject->ptr, subject->len, &status); if (U_FAILURE(status)) { icu_error_set(error, FATAL, status, "ubrk_setText"); return ENGINE_FAILURE; } } while (NULL != (m = u_strFindFirst(subject->ptr + pos, subject->len - pos, p->pattern->ptr, p->pattern->len))) { pos = m - subject->ptr; if (NULL == p->ubrk || (ubrk_isBoundary(p->ubrk, pos) && ubrk_isBoundary(p->ubrk, pos + p->pattern->len))) { matches++; if (interval_list_add(intervals, subject->len, pos, pos + p->pattern->len)) { return ENGINE_WHOLE_LINE_MATCH; } } pos += p->pattern->len; } ubrk_unbindText(p->ubrk); return (matches ? ENGINE_MATCH_FOUND : ENGINE_NO_MATCH); } }
/*
 * Lua: icu_breakpoints(input, locale)
 *
 * Converts the UTF-8 `input` to UTF-16 and visits every UTF-16 offset from 0
 * to the converted length. For each offset that is a line or word boundary
 * for `locale`, a table is pushed with the fields:
 *   type    - "line" if it is a line boundary, otherwise "word"
 *   index   - the offset converted back to a UTF-8 length
 *   subtype - line boundaries only: "soft" when the rule status lies in
 *             [UBRK_LINE_SOFT, UBRK_LINE_SOFT_LIMIT), otherwise "hard"
 *   token   - the input bytes between the previous reported index and this one
 *
 * Returns the number of tables pushed. Raises a Lua error if the input cannot
 * be converted, memory cannot be allocated, or a break iterator cannot be
 * opened.
 */
int icu_breakpoints(lua_State *L) {
  const char* input = luaL_checkstring(L, 1);
  int input_l = (int)strlen(input);
  const char* locale = luaL_checkstring(L, 2);
  UChar *buffer;
  int32_t l = 0, breakcount = 0;
  UErrorCode err = U_ZERO_ERROR;
  UBreakIterator* wordbreaks, *linebreaks;
  int32_t i, previous;

  /* Preflight: with no destination buffer the call reports the required
   * length through U_BUFFER_OVERFLOW_ERROR. Any other failure (for example
   * invalid UTF-8) means `l` cannot be trusted. */
  u_strFromUTF8(NULL, 0, &l, input, input_l, &err);
  if (U_FAILURE(err) && err != U_BUFFER_OVERFLOW_ERROR) {
    return luaL_error(L, "UTF-8 conversion failure: %s", u_errorName(err));
  }
  err = U_ZERO_ERROR;

  /* One spare unit so that an empty input never turns into malloc(0) */
  buffer = (UChar *)malloc(((size_t)l + 1) * sizeof(UChar));
  if (buffer == NULL) {
    return luaL_error(L, "icu_breakpoints: out of memory");
  }
  u_strFromUTF8(buffer, l, &l, input, input_l, &err);
  if (U_FAILURE(err)) {
    free(buffer);
    return luaL_error(L, "UTF-8 conversion failure: %s", u_errorName(err));
  }

  /* luaL_error() does not return, so release everything before calling it */
  wordbreaks = ubrk_open(UBRK_WORD, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    free(buffer);
    return luaL_error(L, "Word break parser failure: %s", u_errorName(err));
  }

  linebreaks = ubrk_open(UBRK_LINE, locale, buffer, l, &err);
  if(U_FAILURE(err)) {
    ubrk_close(wordbreaks);
    free(buffer);
    return luaL_error(L, "Line break parser failure: %s", u_errorName(err));
  }

  previous = 0;
  for (i = 0; i <= l; i++) {
    int32_t type;
    int32_t utf8_index = 0;
    /* Asked once per offset; this also leaves the line iterator positioned
     * for the ubrk_getRuleStatus() call below. */
    int is_line = ubrk_isBoundary(linebreaks, i) ? 1 : 0;

    if (!is_line && !ubrk_isBoundary(wordbreaks, i)) {
      continue;
    }
    lua_checkstack(L, 3);
    /* At some kind of boundary */
    lua_newtable(L);
    lua_pushstring(L, "type");
    lua_pushstring(L, is_line ? "line" : "word");
    lua_settable(L, -3);

    /* Preflight only: the UTF-8 length of the first i UTF-16 units */
    err = U_ZERO_ERROR;
    u_strToUTF8(NULL, 0, &utf8_index, buffer, i, &err);
    assert(U_SUCCESS(err) || err == U_BUFFER_OVERFLOW_ERROR);

    lua_pushstring(L, "index");
    lua_pushinteger(L, utf8_index);
    lua_settable(L, -3);

    if (is_line) {
      lua_pushstring(L, "subtype");
      type = ubrk_getRuleStatus(linebreaks);
      if (type >= UBRK_LINE_SOFT && type < UBRK_LINE_SOFT_LIMIT) {
        lua_pushstring(L, "soft");
      } else {
        lua_pushstring(L, "hard");
      }
      lua_settable(L, -3);
    }
    lua_pushstring(L, "token");
    lua_pushlstring(L, input+previous, utf8_index-previous);
    lua_settable(L, -3);

    previous = utf8_index;
    breakcount++;
  }

  /* The iterators refer to `buffer`, so close them before freeing it */
  ubrk_close(wordbreaks);
  ubrk_close(linebreaks);
  free(buffer);
  return breakcount;
}
/* Forwarding wrapper: returns ubrk_getRuleStatus(bi) unchanged. */
int32_t __hs_ubrk_getRuleStatus(UBreakIterator *bi)
{
    const int32_t ruleStatus = ubrk_getRuleStatus(bi);
    return ruleStatus;
}